{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 65, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -0.9647127985954285, "debug/policy_chosen_logps": -123.97367858886719, "debug/policy_rejected_logits": -1.31121826171875, "debug/policy_rejected_logps": -241.58673095703125, "debug/reference_chosen_logps": -123.97367858886719, "debug/reference_rejected_logps": -241.58673095703125, "epoch": 0.015384615384615385, "grad_norm": 39.79503528028501, "learning_rate": 1e-06, "logits/chosen": -0.9647127985954285, "logits/rejected": -1.31121826171875, "logps/chosen": -123.97367858886719, "logps/rejected": -241.58673095703125, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -0.9786798357963562, "debug/policy_chosen_logps": -165.07327270507812, "debug/policy_rejected_logits": -1.0510333776474, "debug/policy_rejected_logps": -217.42408752441406, "debug/reference_chosen_logps": -164.0435333251953, "debug/reference_rejected_logps": -214.95114135742188, "epoch": 0.03076923076923077, "grad_norm": 22.902488260908584, "learning_rate": 1e-06, "logits/chosen": -0.9786798357963562, "logits/rejected": -1.0510333776474, "logps/chosen": -165.07327270507812, "logps/rejected": -217.42408752441406, "loss": 0.4952, "rewards/accuracies": 0.625, "rewards/chosen": -0.010297326371073723, "rewards/margins": 0.014432210475206375, "rewards/rejected": -0.024729536846280098, "step": 2 }, { "debug/policy_chosen_logits": -0.8966987729072571, "debug/policy_chosen_logps": -138.84356689453125, "debug/policy_rejected_logits": -1.1685205698013306, "debug/policy_rejected_logps": -182.7120361328125, "debug/reference_chosen_logps": -136.04843139648438, "debug/reference_rejected_logps": -179.8273162841797, "epoch": 0.046153846153846156, "grad_norm": 25.87550324450751, "learning_rate": 1e-06, "logits/chosen": -0.8966987729072571, "logits/rejected": -1.1685205698013306, "logps/chosen": -138.84356689453125, "logps/rejected": -182.7120361328125, "loss": 0.4915, "rewards/accuracies": 0.5, "rewards/chosen": -0.0279513169080019, "rewards/margins": 0.000895805424079299, "rewards/rejected": -0.028847120702266693, "step": 3 }, { "debug/policy_chosen_logits": -0.8349865078926086, "debug/policy_chosen_logps": -159.051025390625, "debug/policy_rejected_logits": -1.2308893203735352, "debug/policy_rejected_logps": -232.80641174316406, "debug/reference_chosen_logps": -156.86355590820312, "debug/reference_rejected_logps": -225.85772705078125, "epoch": 0.06153846153846154, "grad_norm": 24.876106367847708, "learning_rate": 1e-06, "logits/chosen": -0.8349865078926086, "logits/rejected": -1.2308893203735352, "logps/chosen": -159.051025390625, "logps/rejected": -232.80641174316406, "loss": 0.4687, "rewards/accuracies": 0.875, "rewards/chosen": -0.021874800324440002, "rewards/margins": 0.047611966729164124, "rewards/rejected": -0.06948676705360413, "step": 4 }, { "debug/policy_chosen_logits": -1.0629841089248657, "debug/policy_chosen_logps": -174.21273803710938, "debug/policy_rejected_logits": -1.1589072942733765, "debug/policy_rejected_logps": -187.26315307617188, "debug/reference_chosen_logps": -169.4216766357422, "debug/reference_rejected_logps": -179.6783905029297, "epoch": 0.07692307692307693, "grad_norm": 18.047345529662877, "learning_rate": 1e-06, "logits/chosen": -1.0629841089248657, "logits/rejected": -1.1589072942733765, "logps/chosen": -174.21273803710938, "logps/rejected": -187.26315307617188, "loss": 0.4648, "rewards/accuracies": 0.75, "rewards/chosen": -0.0479106679558754, "rewards/margins": 0.027937039732933044, "rewards/rejected": -0.07584770768880844, "step": 5 }, { "debug/policy_chosen_logits": -0.8613454699516296, "debug/policy_chosen_logps": -139.91114807128906, "debug/policy_rejected_logits": -1.2306559085845947, "debug/policy_rejected_logps": -241.17236328125, "debug/reference_chosen_logps": -137.893310546875, "debug/reference_rejected_logps": -228.43502807617188, "epoch": 0.09230769230769231, "grad_norm": 36.455684691726056, "learning_rate": 1e-06, "logits/chosen": -0.8613454699516296, "logits/rejected": -1.2306559085845947, "logps/chosen": -139.91114807128906, "logps/rejected": -241.17236328125, "loss": 0.4429, "rewards/accuracies": 0.875, "rewards/chosen": -0.02017822116613388, "rewards/margins": 0.10719531774520874, "rewards/rejected": -0.12737354636192322, "step": 6 }, { "debug/policy_chosen_logits": -0.8777637481689453, "debug/policy_chosen_logps": -144.99093627929688, "debug/policy_rejected_logits": -1.185102105140686, "debug/policy_rejected_logps": -223.3565673828125, "debug/reference_chosen_logps": -140.28123474121094, "debug/reference_rejected_logps": -208.99623107910156, "epoch": 0.1076923076923077, "grad_norm": 16.74682782662271, "learning_rate": 1e-06, "logits/chosen": -0.8777637481689453, "logits/rejected": -1.185102105140686, "logps/chosen": -144.99093627929688, "logps/rejected": -223.3565673828125, "loss": 0.4384, "rewards/accuracies": 0.875, "rewards/chosen": -0.047096967697143555, "rewards/margins": 0.09650653600692749, "rewards/rejected": -0.14360350370407104, "step": 7 }, { "debug/policy_chosen_logits": -0.8900777697563171, "debug/policy_chosen_logps": -149.57913208007812, "debug/policy_rejected_logits": -1.3294610977172852, "debug/policy_rejected_logps": -239.15399169921875, "debug/reference_chosen_logps": -147.19287109375, "debug/reference_rejected_logps": -224.43423461914062, "epoch": 0.12307692307692308, "grad_norm": 16.007294862601125, "learning_rate": 1e-06, "logits/chosen": -0.8900777697563171, "logits/rejected": -1.3294610977172852, "logps/chosen": -149.57913208007812, "logps/rejected": -239.15399169921875, "loss": 0.4338, "rewards/accuracies": 0.75, "rewards/chosen": -0.023862790316343307, "rewards/margins": 0.12333479523658752, "rewards/rejected": -0.14719758927822113, "step": 8 }, { "debug/policy_chosen_logits": -0.9098005890846252, "debug/policy_chosen_logps": -170.28668212890625, "debug/policy_rejected_logits": -1.127171516418457, "debug/policy_rejected_logps": -225.05426025390625, "debug/reference_chosen_logps": -168.51516723632812, "debug/reference_rejected_logps": -209.784423828125, "epoch": 0.13846153846153847, "grad_norm": 19.203565972051226, "learning_rate": 1e-06, "logits/chosen": -0.9098005890846252, "logits/rejected": -1.127171516418457, "logps/chosen": -170.28668212890625, "logps/rejected": -225.05426025390625, "loss": 0.4492, "rewards/accuracies": 0.75, "rewards/chosen": -0.01771523430943489, "rewards/margins": 0.13498306274414062, "rewards/rejected": -0.1526983082294464, "step": 9 }, { "debug/policy_chosen_logits": -0.965919017791748, "debug/policy_chosen_logps": -147.7630615234375, "debug/policy_rejected_logits": -1.2619919776916504, "debug/policy_rejected_logps": -268.1822509765625, "debug/reference_chosen_logps": -151.75869750976562, "debug/reference_rejected_logps": -240.78182983398438, "epoch": 0.15384615384615385, "grad_norm": 20.990932451749828, "learning_rate": 1e-06, "logits/chosen": -0.965919017791748, "logits/rejected": -1.2619919776916504, "logps/chosen": -147.7630615234375, "logps/rejected": -268.1822509765625, "loss": 0.4045, "rewards/accuracies": 0.875, "rewards/chosen": 0.03995640575885773, "rewards/margins": 0.3139604926109314, "rewards/rejected": -0.2740040719509125, "step": 10 }, { "debug/policy_chosen_logits": -0.9731394648551941, "debug/policy_chosen_logps": -130.08078002929688, "debug/policy_rejected_logits": -1.357861042022705, "debug/policy_rejected_logps": -249.02049255371094, "debug/reference_chosen_logps": -137.5183868408203, "debug/reference_rejected_logps": -227.92147827148438, "epoch": 0.16923076923076924, "grad_norm": 13.612452592281528, "learning_rate": 1e-06, "logits/chosen": -0.9731394648551941, "logits/rejected": -1.357861042022705, "logps/chosen": -130.08078002929688, "logps/rejected": -249.02049255371094, "loss": 0.3732, "rewards/accuracies": 0.75, "rewards/chosen": 0.0743759498000145, "rewards/margins": 0.2853661775588989, "rewards/rejected": -0.21099020540714264, "step": 11 }, { "debug/policy_chosen_logits": -0.8728067278862, "debug/policy_chosen_logps": -162.35244750976562, "debug/policy_rejected_logits": -1.2558830976486206, "debug/policy_rejected_logps": -223.6861572265625, "debug/reference_chosen_logps": -168.2450714111328, "debug/reference_rejected_logps": -204.71609497070312, "epoch": 0.18461538461538463, "grad_norm": 21.002799906546286, "learning_rate": 1e-06, "logits/chosen": -0.8728067278862, "logits/rejected": -1.2558830976486206, "logps/chosen": -162.35244750976562, "logps/rejected": -223.6861572265625, "loss": 0.4358, "rewards/accuracies": 0.625, "rewards/chosen": 0.05892624333500862, "rewards/margins": 0.24862679839134216, "rewards/rejected": -0.18970054388046265, "step": 12 }, { "debug/policy_chosen_logits": -0.9268569946289062, "debug/policy_chosen_logps": -150.27801513671875, "debug/policy_rejected_logits": -1.261599063873291, "debug/policy_rejected_logps": -236.6121826171875, "debug/reference_chosen_logps": -162.22314453125, "debug/reference_rejected_logps": -217.59811401367188, "epoch": 0.2, "grad_norm": 24.078493119789076, "learning_rate": 1e-06, "logits/chosen": -0.9268569946289062, "logits/rejected": -1.261599063873291, "logps/chosen": -150.27801513671875, "logps/rejected": -236.6121826171875, "loss": 0.3834, "rewards/accuracies": 0.875, "rewards/chosen": 0.11945129930973053, "rewards/margins": 0.3095918893814087, "rewards/rejected": -0.19014061987400055, "step": 13 }, { "debug/policy_chosen_logits": -0.9981083869934082, "debug/policy_chosen_logps": -144.0004119873047, "debug/policy_rejected_logits": -1.0386605262756348, "debug/policy_rejected_logps": -212.8006134033203, "debug/reference_chosen_logps": -150.2911376953125, "debug/reference_rejected_logps": -200.33120727539062, "epoch": 0.2153846153846154, "grad_norm": 29.26495946304361, "learning_rate": 1e-06, "logits/chosen": -0.9981083869934082, "logits/rejected": -1.0386605262756348, "logps/chosen": -144.0004119873047, "logps/rejected": -212.8006134033203, "loss": 0.3987, "rewards/accuracies": 0.625, "rewards/chosen": 0.06290718913078308, "rewards/margins": 0.18760129809379578, "rewards/rejected": -0.12469412386417389, "step": 14 }, { "debug/policy_chosen_logits": -0.9750750660896301, "debug/policy_chosen_logps": -122.31702423095703, "debug/policy_rejected_logits": -1.2086548805236816, "debug/policy_rejected_logps": -278.26422119140625, "debug/reference_chosen_logps": -133.43157958984375, "debug/reference_rejected_logps": -251.7899932861328, "epoch": 0.23076923076923078, "grad_norm": 14.944920276792743, "learning_rate": 1e-06, "logits/chosen": -0.9750750660896301, "logits/rejected": -1.2086548805236816, "logps/chosen": -122.31702423095703, "logps/rejected": -278.26422119140625, "loss": 0.374, "rewards/accuracies": 0.875, "rewards/chosen": 0.11114557087421417, "rewards/margins": 0.3758879601955414, "rewards/rejected": -0.264742374420166, "step": 15 }, { "debug/policy_chosen_logits": -0.957609236240387, "debug/policy_chosen_logps": -118.1535873413086, "debug/policy_rejected_logits": -1.3453254699707031, "debug/policy_rejected_logps": -242.0065155029297, "debug/reference_chosen_logps": -128.71836853027344, "debug/reference_rejected_logps": -214.23687744140625, "epoch": 0.24615384615384617, "grad_norm": 19.47775666648347, "learning_rate": 1e-06, "logits/chosen": -0.957609236240387, "logits/rejected": -1.3453254699707031, "logps/chosen": -118.1535873413086, "logps/rejected": -242.0065155029297, "loss": 0.3365, "rewards/accuracies": 1.0, "rewards/chosen": 0.10564778745174408, "rewards/margins": 0.38334423303604126, "rewards/rejected": -0.2776964604854584, "step": 16 }, { "debug/policy_chosen_logits": -1.0495269298553467, "debug/policy_chosen_logps": -159.84902954101562, "debug/policy_rejected_logits": -1.0688265562057495, "debug/policy_rejected_logps": -218.22579956054688, "debug/reference_chosen_logps": -161.792236328125, "debug/reference_rejected_logps": -215.38601684570312, "epoch": 0.26153846153846155, "grad_norm": 13.042142522379077, "learning_rate": 1e-06, "logits/chosen": -1.0495269298553467, "logits/rejected": -1.0688265562057495, "logps/chosen": -159.84902954101562, "logps/rejected": -218.22579956054688, "loss": 0.4045, "rewards/accuracies": 0.625, "rewards/chosen": 0.01943197287619114, "rewards/margins": 0.047829799354076385, "rewards/rejected": -0.028397828340530396, "step": 17 }, { "debug/policy_chosen_logits": -1.0508699417114258, "debug/policy_chosen_logps": -99.80723571777344, "debug/policy_rejected_logits": -1.388744592666626, "debug/policy_rejected_logps": -294.78753662109375, "debug/reference_chosen_logps": -107.44318389892578, "debug/reference_rejected_logps": -258.77783203125, "epoch": 0.27692307692307694, "grad_norm": 13.499335626092817, "learning_rate": 1e-06, "logits/chosen": -1.0508699417114258, "logits/rejected": -1.388744592666626, "logps/chosen": -99.80723571777344, "logps/rejected": -294.78753662109375, "loss": 0.4099, "rewards/accuracies": 1.0, "rewards/chosen": 0.07635952532291412, "rewards/margins": 0.4364564120769501, "rewards/rejected": -0.36009684205055237, "step": 18 }, { "debug/policy_chosen_logits": -0.9774982929229736, "debug/policy_chosen_logps": -113.48712158203125, "debug/policy_rejected_logits": -1.0137752294540405, "debug/policy_rejected_logps": -197.8218231201172, "debug/reference_chosen_logps": -125.36196899414062, "debug/reference_rejected_logps": -197.83901977539062, "epoch": 0.2923076923076923, "grad_norm": 46.028370062522505, "learning_rate": 1e-06, "logits/chosen": -0.9774982929229736, "logits/rejected": -1.0137752294540405, "logps/chosen": -113.48712158203125, "logps/rejected": -197.8218231201172, "loss": 0.4434, "rewards/accuracies": 0.5, "rewards/chosen": 0.1187485083937645, "rewards/margins": 0.11857648193836212, "rewards/rejected": 0.0001720339059829712, "step": 19 }, { "debug/policy_chosen_logits": -0.9281129837036133, "debug/policy_chosen_logps": -110.92404174804688, "debug/policy_rejected_logits": -1.2334582805633545, "debug/policy_rejected_logps": -282.22894287109375, "debug/reference_chosen_logps": -127.69793701171875, "debug/reference_rejected_logps": -265.60455322265625, "epoch": 0.3076923076923077, "grad_norm": 30.786176694930152, "learning_rate": 1e-06, "logits/chosen": -0.9281129837036133, "logits/rejected": -1.2334582805633545, "logps/chosen": -110.92404174804688, "logps/rejected": -282.22894287109375, "loss": 0.3952, "rewards/accuracies": 1.0, "rewards/chosen": 0.16773885488510132, "rewards/margins": 0.33398276567459106, "rewards/rejected": -0.16624392569065094, "step": 20 }, { "debug/policy_chosen_logits": -1.0489017963409424, "debug/policy_chosen_logps": -142.0389404296875, "debug/policy_rejected_logits": -1.2170113325119019, "debug/policy_rejected_logps": -261.4895324707031, "debug/reference_chosen_logps": -153.43414306640625, "debug/reference_rejected_logps": -249.89022827148438, "epoch": 0.3230769230769231, "grad_norm": 14.245651886832839, "learning_rate": 1e-06, "logits/chosen": -1.0489017963409424, "logits/rejected": -1.2170113325119019, "logps/chosen": -142.0389404296875, "logps/rejected": -261.4895324707031, "loss": 0.3621, "rewards/accuracies": 0.625, "rewards/chosen": 0.11395198851823807, "rewards/margins": 0.22994519770145416, "rewards/rejected": -0.11599321663379669, "step": 21 }, { "debug/policy_chosen_logits": -1.2051200866699219, "debug/policy_chosen_logps": -154.2470703125, "debug/policy_rejected_logits": -1.4243721961975098, "debug/policy_rejected_logps": -228.190673828125, "debug/reference_chosen_logps": -170.4795379638672, "debug/reference_rejected_logps": -220.2373809814453, "epoch": 0.3384615384615385, "grad_norm": 20.829794058513283, "learning_rate": 1e-06, "logits/chosen": -1.2051200866699219, "logits/rejected": -1.4243721961975098, "logps/chosen": -154.2470703125, "logps/rejected": -228.190673828125, "loss": 0.3421, "rewards/accuracies": 0.5, "rewards/chosen": 0.16232462227344513, "rewards/margins": 0.24185749888420105, "rewards/rejected": -0.07953288406133652, "step": 22 }, { "debug/policy_chosen_logits": -1.2017217874526978, "debug/policy_chosen_logps": -117.44737243652344, "debug/policy_rejected_logits": -1.4578936100006104, "debug/policy_rejected_logps": -213.82699584960938, "debug/reference_chosen_logps": -135.86666870117188, "debug/reference_rejected_logps": -211.36785888671875, "epoch": 0.35384615384615387, "grad_norm": 17.954045455637697, "learning_rate": 1e-06, "logits/chosen": -1.2017217874526978, "logits/rejected": -1.4578936100006104, "logps/chosen": -117.44737243652344, "logps/rejected": -213.82699584960938, "loss": 0.4522, "rewards/accuracies": 0.5, "rewards/chosen": 0.184192955493927, "rewards/margins": 0.20878440141677856, "rewards/rejected": -0.02459145151078701, "step": 23 }, { "debug/policy_chosen_logits": -0.8499487638473511, "debug/policy_chosen_logps": -128.18792724609375, "debug/policy_rejected_logits": -1.0719280242919922, "debug/policy_rejected_logps": -215.59121704101562, "debug/reference_chosen_logps": -144.73577880859375, "debug/reference_rejected_logps": -216.82037353515625, "epoch": 0.36923076923076925, "grad_norm": 26.76844664947462, "learning_rate": 1e-06, "logits/chosen": -0.8499487638473511, "logits/rejected": -1.0719280242919922, "logps/chosen": -128.18792724609375, "logps/rejected": -215.59121704101562, "loss": 0.3353, "rewards/accuracies": 0.375, "rewards/chosen": 0.16547852754592896, "rewards/margins": 0.1531868278980255, "rewards/rejected": 0.012291695922613144, "step": 24 }, { "debug/policy_chosen_logits": -1.1630948781967163, "debug/policy_chosen_logps": -105.56856536865234, "debug/policy_rejected_logits": -1.32340669631958, "debug/policy_rejected_logps": -221.09933471679688, "debug/reference_chosen_logps": -121.05016326904297, "debug/reference_rejected_logps": -209.24151611328125, "epoch": 0.38461538461538464, "grad_norm": 21.670904410125967, "learning_rate": 1e-06, "logits/chosen": -1.1630948781967163, "logits/rejected": -1.32340669631958, "logps/chosen": -105.56856536865234, "logps/rejected": -221.09933471679688, "loss": 0.3962, "rewards/accuracies": 1.0, "rewards/chosen": 0.15481595695018768, "rewards/margins": 0.2733941078186035, "rewards/rejected": -0.11857814341783524, "step": 25 }, { "debug/policy_chosen_logits": -0.9490920901298523, "debug/policy_chosen_logps": -113.62861633300781, "debug/policy_rejected_logits": -1.2542915344238281, "debug/policy_rejected_logps": -293.95989990234375, "debug/reference_chosen_logps": -127.203857421875, "debug/reference_rejected_logps": -275.20550537109375, "epoch": 0.4, "grad_norm": 15.696719670073598, "learning_rate": 1e-06, "logits/chosen": -0.9490920901298523, "logits/rejected": -1.2542915344238281, "logps/chosen": -113.62861633300781, "logps/rejected": -293.95989990234375, "loss": 0.3798, "rewards/accuracies": 1.0, "rewards/chosen": 0.135752335190773, "rewards/margins": 0.3232962489128113, "rewards/rejected": -0.18754389882087708, "step": 26 }, { "debug/policy_chosen_logits": -0.9385436177253723, "debug/policy_chosen_logps": -133.846435546875, "debug/policy_rejected_logits": -1.137320637702942, "debug/policy_rejected_logps": -276.10906982421875, "debug/reference_chosen_logps": -149.49008178710938, "debug/reference_rejected_logps": -256.6170654296875, "epoch": 0.4153846153846154, "grad_norm": 16.356437028347088, "learning_rate": 1e-06, "logits/chosen": -0.9385436177253723, "logits/rejected": -1.137320637702942, "logps/chosen": -133.846435546875, "logps/rejected": -276.10906982421875, "loss": 0.3314, "rewards/accuracies": 0.875, "rewards/chosen": 0.15643641352653503, "rewards/margins": 0.35135647654533386, "rewards/rejected": -0.19492007791996002, "step": 27 }, { "debug/policy_chosen_logits": -1.0301445722579956, "debug/policy_chosen_logps": -127.61602783203125, "debug/policy_rejected_logits": -1.2776472568511963, "debug/policy_rejected_logps": -217.84393310546875, "debug/reference_chosen_logps": -144.73550415039062, "debug/reference_rejected_logps": -202.54354858398438, "epoch": 0.4307692307692308, "grad_norm": 21.190086495233416, "learning_rate": 1e-06, "logits/chosen": -1.0301445722579956, "logits/rejected": -1.2776472568511963, "logps/chosen": -127.61602783203125, "logps/rejected": -217.84393310546875, "loss": 0.3107, "rewards/accuracies": 0.875, "rewards/chosen": 0.1711946725845337, "rewards/margins": 0.32419848442077637, "rewards/rejected": -0.15300381183624268, "step": 28 }, { "debug/policy_chosen_logits": -1.0022560358047485, "debug/policy_chosen_logps": -172.74249267578125, "debug/policy_rejected_logits": -1.3468449115753174, "debug/policy_rejected_logps": -227.54037475585938, "debug/reference_chosen_logps": -177.01214599609375, "debug/reference_rejected_logps": -214.42225646972656, "epoch": 0.4461538461538462, "grad_norm": 25.851933699207674, "learning_rate": 1e-06, "logits/chosen": -1.0022560358047485, "logits/rejected": -1.3468449115753174, "logps/chosen": -172.74249267578125, "logps/rejected": -227.54037475585938, "loss": 0.3899, "rewards/accuracies": 0.625, "rewards/chosen": 0.042696550488471985, "rewards/margins": 0.17387771606445312, "rewards/rejected": -0.13118118047714233, "step": 29 }, { "debug/policy_chosen_logits": -1.0099124908447266, "debug/policy_chosen_logps": -95.54536437988281, "debug/policy_rejected_logits": -1.2413769960403442, "debug/policy_rejected_logps": -189.86050415039062, "debug/reference_chosen_logps": -113.0428695678711, "debug/reference_rejected_logps": -186.69793701171875, "epoch": 0.46153846153846156, "grad_norm": 13.611136093486117, "learning_rate": 1e-06, "logits/chosen": -1.0099124908447266, "logits/rejected": -1.2413769960403442, "logps/chosen": -95.54536437988281, "logps/rejected": -189.86050415039062, "loss": 0.3601, "rewards/accuracies": 0.75, "rewards/chosen": 0.17497505247592926, "rewards/margins": 0.2066008746623993, "rewards/rejected": -0.03162582963705063, "step": 30 }, { "debug/policy_chosen_logits": -1.0692760944366455, "debug/policy_chosen_logps": -105.94851684570312, "debug/policy_rejected_logits": -1.3923370838165283, "debug/policy_rejected_logps": -220.22308349609375, "debug/reference_chosen_logps": -127.50257873535156, "debug/reference_rejected_logps": -207.05284118652344, "epoch": 0.47692307692307695, "grad_norm": 15.69238072866507, "learning_rate": 1e-06, "logits/chosen": -1.0692760944366455, "logits/rejected": -1.3923370838165283, "logps/chosen": -105.94851684570312, "logps/rejected": -220.22308349609375, "loss": 0.365, "rewards/accuracies": 0.75, "rewards/chosen": 0.21554069221019745, "rewards/margins": 0.3472433090209961, "rewards/rejected": -0.13170258700847626, "step": 31 }, { "debug/policy_chosen_logits": -0.9629390835762024, "debug/policy_chosen_logps": -116.25215148925781, "debug/policy_rejected_logits": -1.2167094945907593, "debug/policy_rejected_logps": -292.70867919921875, "debug/reference_chosen_logps": -132.44338989257812, "debug/reference_rejected_logps": -275.3009033203125, "epoch": 0.49230769230769234, "grad_norm": 11.458704557453553, "learning_rate": 1e-06, "logits/chosen": -0.9629390835762024, "logits/rejected": -1.2167094945907593, "logps/chosen": -116.25215148925781, "logps/rejected": -292.70867919921875, "loss": 0.3086, "rewards/accuracies": 0.75, "rewards/chosen": 0.16191250085830688, "rewards/margins": 0.33599066734313965, "rewards/rejected": -0.17407818138599396, "step": 32 }, { "debug/policy_chosen_logits": -1.1342741250991821, "debug/policy_chosen_logps": -88.70211791992188, "debug/policy_rejected_logits": -1.3675200939178467, "debug/policy_rejected_logps": -312.9244384765625, "debug/reference_chosen_logps": -107.44647216796875, "debug/reference_rejected_logps": -275.48333740234375, "epoch": 0.5076923076923077, "grad_norm": 19.428633515391827, "learning_rate": 1e-06, "logits/chosen": -1.1342741250991821, "logits/rejected": -1.3675200939178467, "logps/chosen": -88.70211791992188, "logps/rejected": -312.9244384765625, "loss": 0.3464, "rewards/accuracies": 1.0, "rewards/chosen": 0.1874435693025589, "rewards/margins": 0.5618546009063721, "rewards/rejected": -0.37441104650497437, "step": 33 }, { "debug/policy_chosen_logits": -1.0474224090576172, "debug/policy_chosen_logps": -125.40650177001953, "debug/policy_rejected_logits": -1.4361001253128052, "debug/policy_rejected_logps": -245.4980010986328, "debug/reference_chosen_logps": -147.00311279296875, "debug/reference_rejected_logps": -218.89697265625, "epoch": 0.5230769230769231, "grad_norm": 18.811960048896026, "learning_rate": 1e-06, "logits/chosen": -1.0474224090576172, "logits/rejected": -1.4361001253128052, "logps/chosen": -125.40650177001953, "logps/rejected": -245.4980010986328, "loss": 0.3733, "rewards/accuracies": 0.875, "rewards/chosen": 0.21596619486808777, "rewards/margins": 0.4819765090942383, "rewards/rejected": -0.2660103142261505, "step": 34 }, { "debug/policy_chosen_logits": -1.0789586305618286, "debug/policy_chosen_logps": -122.23463439941406, "debug/policy_rejected_logits": -1.1521984338760376, "debug/policy_rejected_logps": -190.08827209472656, "debug/reference_chosen_logps": -136.9556427001953, "debug/reference_rejected_logps": -185.76451110839844, "epoch": 0.5384615384615384, "grad_norm": 33.304257629648056, "learning_rate": 1e-06, "logits/chosen": -1.0789586305618286, "logits/rejected": -1.1521984338760376, "logps/chosen": -122.23463439941406, "logps/rejected": -190.08827209472656, "loss": 0.4049, "rewards/accuracies": 0.75, "rewards/chosen": 0.147210031747818, "rewards/margins": 0.19044768810272217, "rewards/rejected": -0.04323763772845268, "step": 35 }, { "debug/policy_chosen_logits": -1.091545581817627, "debug/policy_chosen_logps": -178.88722229003906, "debug/policy_rejected_logits": -1.202536940574646, "debug/policy_rejected_logps": -231.08212280273438, "debug/reference_chosen_logps": -194.953369140625, "debug/reference_rejected_logps": -223.31463623046875, "epoch": 0.5538461538461539, "grad_norm": 16.99572641358511, "learning_rate": 1e-06, "logits/chosen": -1.091545581817627, "logits/rejected": -1.202536940574646, "logps/chosen": -178.88722229003906, "logps/rejected": -231.08212280273438, "loss": 0.3911, "rewards/accuracies": 0.625, "rewards/chosen": 0.16066133975982666, "rewards/margins": 0.2383359968662262, "rewards/rejected": -0.07767465710639954, "step": 36 }, { "debug/policy_chosen_logits": -0.9485809206962585, "debug/policy_chosen_logps": -125.60317993164062, "debug/policy_rejected_logits": -1.328150749206543, "debug/policy_rejected_logps": -216.15621948242188, "debug/reference_chosen_logps": -145.79241943359375, "debug/reference_rejected_logps": -200.3070068359375, "epoch": 0.5692307692307692, "grad_norm": 18.787049743694737, "learning_rate": 1e-06, "logits/chosen": -0.9485809206962585, "logits/rejected": -1.328150749206543, "logps/chosen": -125.60317993164062, "logps/rejected": -216.15621948242188, "loss": 0.3548, "rewards/accuracies": 0.75, "rewards/chosen": 0.20189230144023895, "rewards/margins": 0.3603845238685608, "rewards/rejected": -0.15849220752716064, "step": 37 }, { "debug/policy_chosen_logits": -1.0845237970352173, "debug/policy_chosen_logps": -125.10261535644531, "debug/policy_rejected_logits": -1.2392336130142212, "debug/policy_rejected_logps": -180.54214477539062, "debug/reference_chosen_logps": -142.53285217285156, "debug/reference_rejected_logps": -181.42666625976562, "epoch": 0.5846153846153846, "grad_norm": 25.720524625538612, "learning_rate": 1e-06, "logits/chosen": -1.0845237970352173, "logits/rejected": -1.2392336130142212, "logps/chosen": -125.10261535644531, "logps/rejected": -180.54214477539062, "loss": 0.4502, "rewards/accuracies": 0.625, "rewards/chosen": 0.1743023693561554, "rewards/margins": 0.16545727849006653, "rewards/rejected": 0.008845105767250061, "step": 38 }, { "debug/policy_chosen_logits": -1.0929769277572632, "debug/policy_chosen_logps": -128.90899658203125, "debug/policy_rejected_logits": -1.1566870212554932, "debug/policy_rejected_logps": -268.0566101074219, "debug/reference_chosen_logps": -144.32101440429688, "debug/reference_rejected_logps": -261.6700744628906, "epoch": 0.6, "grad_norm": 24.861695719305146, "learning_rate": 1e-06, "logits/chosen": -1.0929769277572632, "logits/rejected": -1.1566870212554932, "logps/chosen": -128.90899658203125, "logps/rejected": -268.0566101074219, "loss": 0.4098, "rewards/accuracies": 0.875, "rewards/chosen": 0.15412017703056335, "rewards/margins": 0.21798546612262726, "rewards/rejected": -0.0638652890920639, "step": 39 }, { "debug/policy_chosen_logits": -1.0760107040405273, "debug/policy_chosen_logps": -103.31159210205078, "debug/policy_rejected_logits": -1.3527721166610718, "debug/policy_rejected_logps": -233.61349487304688, "debug/reference_chosen_logps": -116.12698364257812, "debug/reference_rejected_logps": -210.38319396972656, "epoch": 0.6153846153846154, "grad_norm": 21.27474329012195, "learning_rate": 1e-06, "logits/chosen": -1.0760107040405273, "logits/rejected": -1.3527721166610718, "logps/chosen": -103.31159210205078, "logps/rejected": -233.61349487304688, "loss": 0.356, "rewards/accuracies": 0.875, "rewards/chosen": 0.1281539350748062, "rewards/margins": 0.36045682430267334, "rewards/rejected": -0.23230290412902832, "step": 40 }, { "debug/policy_chosen_logits": -1.1966038942337036, "debug/policy_chosen_logps": -162.63662719726562, "debug/policy_rejected_logits": -1.3122248649597168, "debug/policy_rejected_logps": -289.13299560546875, "debug/reference_chosen_logps": -162.546875, "debug/reference_rejected_logps": -259.9058532714844, "epoch": 0.6307692307692307, "grad_norm": 23.60915267979043, "learning_rate": 1e-06, "logits/chosen": -1.1966038942337036, "logits/rejected": -1.3122248649597168, "logps/chosen": -162.63662719726562, "logps/rejected": -289.13299560546875, "loss": 0.3907, "rewards/accuracies": 0.75, "rewards/chosen": -0.0008975863456726074, "rewards/margins": 0.2913738787174225, "rewards/rejected": -0.2922714948654175, "step": 41 }, { "debug/policy_chosen_logits": -1.355502724647522, "debug/policy_chosen_logps": -116.44414520263672, "debug/policy_rejected_logits": -1.4048125743865967, "debug/policy_rejected_logps": -176.58692932128906, "debug/reference_chosen_logps": -120.18405151367188, "debug/reference_rejected_logps": -162.6604461669922, "epoch": 0.6461538461538462, "grad_norm": 26.61950044662249, "learning_rate": 1e-06, "logits/chosen": -1.355502724647522, "logits/rejected": -1.4048125743865967, "logps/chosen": -116.44414520263672, "logps/rejected": -176.58692932128906, "loss": 0.389, "rewards/accuracies": 0.75, "rewards/chosen": 0.037399038672447205, "rewards/margins": 0.17666393518447876, "rewards/rejected": -0.13926488161087036, "step": 42 }, { "debug/policy_chosen_logits": -0.9198251962661743, "debug/policy_chosen_logps": -138.879150390625, "debug/policy_rejected_logits": -1.1327546834945679, "debug/policy_rejected_logps": -246.67523193359375, "debug/reference_chosen_logps": -145.7458038330078, "debug/reference_rejected_logps": -225.77520751953125, "epoch": 0.6615384615384615, "grad_norm": 13.920738731758536, "learning_rate": 1e-06, "logits/chosen": -0.9198251962661743, "logits/rejected": -1.1327546834945679, "logps/chosen": -138.879150390625, "logps/rejected": -246.67523193359375, "loss": 0.401, "rewards/accuracies": 0.625, "rewards/chosen": 0.06866656243801117, "rewards/margins": 0.27766674757003784, "rewards/rejected": -0.20900021493434906, "step": 43 }, { "debug/policy_chosen_logits": -1.075502872467041, "debug/policy_chosen_logps": -138.92579650878906, "debug/policy_rejected_logits": -1.3321685791015625, "debug/policy_rejected_logps": -269.3880920410156, "debug/reference_chosen_logps": -149.0914306640625, "debug/reference_rejected_logps": -243.71829223632812, "epoch": 0.676923076923077, "grad_norm": 19.636655476760644, "learning_rate": 1e-06, "logits/chosen": -1.075502872467041, "logits/rejected": -1.3321685791015625, "logps/chosen": -138.92579650878906, "logps/rejected": -269.3880920410156, "loss": 0.3799, "rewards/accuracies": 0.75, "rewards/chosen": 0.10165630280971527, "rewards/margins": 0.3583540916442871, "rewards/rejected": -0.25669777393341064, "step": 44 }, { "debug/policy_chosen_logits": -1.0624967813491821, "debug/policy_chosen_logps": -123.77445220947266, "debug/policy_rejected_logits": -1.0901663303375244, "debug/policy_rejected_logps": -219.983642578125, "debug/reference_chosen_logps": -130.23268127441406, "debug/reference_rejected_logps": -215.2198944091797, "epoch": 0.6923076923076923, "grad_norm": 23.603940752463636, "learning_rate": 1e-06, "logits/chosen": -1.0624967813491821, "logits/rejected": -1.0901663303375244, "logps/chosen": -123.77445220947266, "logps/rejected": -219.983642578125, "loss": 0.4365, "rewards/accuracies": 0.5, "rewards/chosen": 0.0645822286605835, "rewards/margins": 0.1122196614742279, "rewards/rejected": -0.04763743281364441, "step": 45 }, { "debug/policy_chosen_logits": -0.9820321202278137, "debug/policy_chosen_logps": -145.89419555664062, "debug/policy_rejected_logits": -1.2636444568634033, "debug/policy_rejected_logps": -258.6197509765625, "debug/reference_chosen_logps": -162.72927856445312, "debug/reference_rejected_logps": -234.30699157714844, "epoch": 0.7076923076923077, "grad_norm": 17.15613483352223, "learning_rate": 1e-06, "logits/chosen": -0.9820321202278137, "logits/rejected": -1.2636444568634033, "logps/chosen": -145.89419555664062, "logps/rejected": -258.6197509765625, "loss": 0.3545, "rewards/accuracies": 0.875, "rewards/chosen": 0.1683509349822998, "rewards/margins": 0.41147857904434204, "rewards/rejected": -0.24312765896320343, "step": 46 }, { "debug/policy_chosen_logits": -0.9655563831329346, "debug/policy_chosen_logps": -147.6282501220703, "debug/policy_rejected_logits": -1.4386718273162842, "debug/policy_rejected_logps": -292.20941162109375, "debug/reference_chosen_logps": -167.51661682128906, "debug/reference_rejected_logps": -268.37603759765625, "epoch": 0.7230769230769231, "grad_norm": 10.311916929230629, "learning_rate": 1e-06, "logits/chosen": -0.9655563831329346, "logits/rejected": -1.4386718273162842, "logps/chosen": -147.6282501220703, "logps/rejected": -292.20941162109375, "loss": 0.3314, "rewards/accuracies": 0.875, "rewards/chosen": 0.19888359308242798, "rewards/margins": 0.43721747398376465, "rewards/rejected": -0.23833388090133667, "step": 47 }, { "debug/policy_chosen_logits": -1.0769726037979126, "debug/policy_chosen_logps": -138.5823211669922, "debug/policy_rejected_logits": -1.4088174104690552, "debug/policy_rejected_logps": -265.21136474609375, "debug/reference_chosen_logps": -154.25830078125, "debug/reference_rejected_logps": -245.86599731445312, "epoch": 0.7384615384615385, "grad_norm": 44.605855363152635, "learning_rate": 1e-06, "logits/chosen": -1.0769726037979126, "logits/rejected": -1.4088174104690552, "logps/chosen": -138.5823211669922, "logps/rejected": -265.21136474609375, "loss": 0.315, "rewards/accuracies": 0.875, "rewards/chosen": 0.15675979852676392, "rewards/margins": 0.3502136468887329, "rewards/rejected": -0.193453848361969, "step": 48 }, { "debug/policy_chosen_logits": -1.0038551092147827, "debug/policy_chosen_logps": -115.539794921875, "debug/policy_rejected_logits": -1.0347591638565063, "debug/policy_rejected_logps": -187.43521118164062, "debug/reference_chosen_logps": -129.97064208984375, "debug/reference_rejected_logps": -184.09988403320312, "epoch": 0.7538461538461538, "grad_norm": 32.371087784097746, "learning_rate": 1e-06, "logits/chosen": -1.0038551092147827, "logits/rejected": -1.0347591638565063, "logps/chosen": -115.539794921875, "logps/rejected": -187.43521118164062, "loss": 0.334, "rewards/accuracies": 0.75, "rewards/chosen": 0.1443084478378296, "rewards/margins": 0.17766161262989044, "rewards/rejected": -0.03335317596793175, "step": 49 }, { "debug/policy_chosen_logits": -1.0499004125595093, "debug/policy_chosen_logps": -130.71490478515625, "debug/policy_rejected_logits": -1.208733081817627, "debug/policy_rejected_logps": -235.5469970703125, "debug/reference_chosen_logps": -144.00558471679688, "debug/reference_rejected_logps": -219.49880981445312, "epoch": 0.7692307692307693, "grad_norm": 19.363793375530744, "learning_rate": 1e-06, "logits/chosen": -1.0499004125595093, "logits/rejected": -1.208733081817627, "logps/chosen": -130.71490478515625, "logps/rejected": -235.5469970703125, "loss": 0.3376, "rewards/accuracies": 0.75, "rewards/chosen": 0.13290682435035706, "rewards/margins": 0.2933886647224426, "rewards/rejected": -0.16048187017440796, "step": 50 }, { "debug/policy_chosen_logits": -1.1999765634536743, "debug/policy_chosen_logps": -112.4703598022461, "debug/policy_rejected_logits": -1.222855806350708, "debug/policy_rejected_logps": -211.74696350097656, "debug/reference_chosen_logps": -127.15155029296875, "debug/reference_rejected_logps": -198.1513671875, "epoch": 0.7846153846153846, "grad_norm": 15.135997979783829, "learning_rate": 1e-06, "logits/chosen": -1.1999765634536743, "logits/rejected": -1.222855806350708, "logps/chosen": -112.4703598022461, "logps/rejected": -211.74696350097656, "loss": 0.3792, "rewards/accuracies": 0.75, "rewards/chosen": 0.14681187272071838, "rewards/margins": 0.28276756405830383, "rewards/rejected": -0.13595569133758545, "step": 51 }, { "debug/policy_chosen_logits": -0.8939087986946106, "debug/policy_chosen_logps": -116.76834869384766, "debug/policy_rejected_logits": -1.1900181770324707, "debug/policy_rejected_logps": -213.6509246826172, "debug/reference_chosen_logps": -135.86460876464844, "debug/reference_rejected_logps": -203.9865264892578, "epoch": 0.8, "grad_norm": 37.3845253002307, "learning_rate": 1e-06, "logits/chosen": -0.8939087986946106, "logits/rejected": -1.1900181770324707, "logps/chosen": -116.76834869384766, "logps/rejected": -213.6509246826172, "loss": 0.3832, "rewards/accuracies": 1.0, "rewards/chosen": 0.19096264243125916, "rewards/margins": 0.28760671615600586, "rewards/rejected": -0.09664406627416611, "step": 52 }, { "debug/policy_chosen_logits": -0.9584690928459167, "debug/policy_chosen_logps": -175.17242431640625, "debug/policy_rejected_logits": -1.1710708141326904, "debug/policy_rejected_logps": -213.17564392089844, "debug/reference_chosen_logps": -179.64723205566406, "debug/reference_rejected_logps": -189.12155151367188, "epoch": 0.8153846153846154, "grad_norm": 62.40121544308884, "learning_rate": 1e-06, "logits/chosen": -0.9584690928459167, "logits/rejected": -1.1710708141326904, "logps/chosen": -175.17242431640625, "logps/rejected": -213.17564392089844, "loss": 0.3789, "rewards/accuracies": 0.75, "rewards/chosen": 0.04474814236164093, "rewards/margins": 0.28528910875320435, "rewards/rejected": -0.24054095149040222, "step": 53 }, { "debug/policy_chosen_logits": -1.1082603931427002, "debug/policy_chosen_logps": -179.68826293945312, "debug/policy_rejected_logits": -1.357857584953308, "debug/policy_rejected_logps": -226.95327758789062, "debug/reference_chosen_logps": -183.20611572265625, "debug/reference_rejected_logps": -199.65640258789062, "epoch": 0.8307692307692308, "grad_norm": 62.93829593264758, "learning_rate": 1e-06, "logits/chosen": -1.1082603931427002, "logits/rejected": -1.357857584953308, "logps/chosen": -179.68826293945312, "logps/rejected": -226.95327758789062, "loss": 0.4486, "rewards/accuracies": 0.875, "rewards/chosen": 0.03517843782901764, "rewards/margins": 0.30814701318740845, "rewards/rejected": -0.2729685604572296, "step": 54 }, { "debug/policy_chosen_logits": -1.084839940071106, "debug/policy_chosen_logps": -126.61346435546875, "debug/policy_rejected_logits": -1.0655025243759155, "debug/policy_rejected_logps": -227.86724853515625, "debug/reference_chosen_logps": -123.68621826171875, "debug/reference_rejected_logps": -213.22573852539062, "epoch": 0.8461538461538461, "grad_norm": 39.62365642421888, "learning_rate": 1e-06, "logits/chosen": -1.084839940071106, "logits/rejected": -1.0655025243759155, "logps/chosen": -126.61346435546875, "logps/rejected": -227.86724853515625, "loss": 0.3657, "rewards/accuracies": 0.625, "rewards/chosen": -0.02927243895828724, "rewards/margins": 0.11714262515306473, "rewards/rejected": -0.14641505479812622, "step": 55 }, { "debug/policy_chosen_logits": -1.1687896251678467, "debug/policy_chosen_logps": -122.57518005371094, "debug/policy_rejected_logits": -1.2633333206176758, "debug/policy_rejected_logps": -264.3393249511719, "debug/reference_chosen_logps": -136.72682189941406, "debug/reference_rejected_logps": -241.86154174804688, "epoch": 0.8615384615384616, "grad_norm": 47.1019013360206, "learning_rate": 1e-06, "logits/chosen": -1.1687896251678467, "logits/rejected": -1.2633333206176758, "logps/chosen": -122.57518005371094, "logps/rejected": -264.3393249511719, "loss": 0.3505, "rewards/accuracies": 0.75, "rewards/chosen": 0.14151641726493835, "rewards/margins": 0.3662940263748169, "rewards/rejected": -0.22477757930755615, "step": 56 }, { "debug/policy_chosen_logits": -1.1213276386260986, "debug/policy_chosen_logps": -144.16796875, "debug/policy_rejected_logits": -1.2798304557800293, "debug/policy_rejected_logps": -269.6147155761719, "debug/reference_chosen_logps": -158.32400512695312, "debug/reference_rejected_logps": -251.46096801757812, "epoch": 0.8769230769230769, "grad_norm": 12.111206423858098, "learning_rate": 1e-06, "logits/chosen": -1.1213276386260986, "logits/rejected": -1.2798304557800293, "logps/chosen": -144.16796875, "logps/rejected": -269.6147155761719, "loss": 0.3353, "rewards/accuracies": 0.875, "rewards/chosen": 0.14156028628349304, "rewards/margins": 0.3230975866317749, "rewards/rejected": -0.18153730034828186, "step": 57 }, { "debug/policy_chosen_logits": -1.0232670307159424, "debug/policy_chosen_logps": -121.2646484375, "debug/policy_rejected_logits": -1.2712284326553345, "debug/policy_rejected_logps": -220.20851135253906, "debug/reference_chosen_logps": -139.99087524414062, "debug/reference_rejected_logps": -208.25103759765625, "epoch": 0.8923076923076924, "grad_norm": 25.19320439679914, "learning_rate": 1e-06, "logits/chosen": -1.0232670307159424, "logits/rejected": -1.2712284326553345, "logps/chosen": -121.2646484375, "logps/rejected": -220.20851135253906, "loss": 0.3461, "rewards/accuracies": 1.0, "rewards/chosen": 0.18726232647895813, "rewards/margins": 0.3068370521068573, "rewards/rejected": -0.11957473307847977, "step": 58 }, { "debug/policy_chosen_logits": -1.0311870574951172, "debug/policy_chosen_logps": -136.0186767578125, "debug/policy_rejected_logits": -1.283647894859314, "debug/policy_rejected_logps": -244.9959259033203, "debug/reference_chosen_logps": -154.48251342773438, "debug/reference_rejected_logps": -233.20367431640625, "epoch": 0.9076923076923077, "grad_norm": 30.477284008078836, "learning_rate": 1e-06, "logits/chosen": -1.0311870574951172, "logits/rejected": -1.283647894859314, "logps/chosen": -136.0186767578125, "logps/rejected": -244.9959259033203, "loss": 0.3438, "rewards/accuracies": 0.75, "rewards/chosen": 0.18463847041130066, "rewards/margins": 0.30256104469299316, "rewards/rejected": -0.1179225891828537, "step": 59 }, { "debug/policy_chosen_logits": -1.2158256769180298, "debug/policy_chosen_logps": -166.81390380859375, "debug/policy_rejected_logits": -1.1210004091262817, "debug/policy_rejected_logps": -187.45571899414062, "debug/reference_chosen_logps": -173.96209716796875, "debug/reference_rejected_logps": -197.62841796875, "epoch": 0.9230769230769231, "grad_norm": 48.260243781408725, "learning_rate": 1e-06, "logits/chosen": -1.2158256769180298, "logits/rejected": -1.1210004091262817, "logps/chosen": -166.81390380859375, "logps/rejected": -187.45571899414062, "loss": 0.4069, "rewards/accuracies": 0.375, "rewards/chosen": 0.07148197293281555, "rewards/margins": -0.03024490550160408, "rewards/rejected": 0.10172686725854874, "step": 60 }, { "debug/policy_chosen_logits": -1.06016206741333, "debug/policy_chosen_logps": -114.04878234863281, "debug/policy_rejected_logits": -1.1642900705337524, "debug/policy_rejected_logps": -191.70645141601562, "debug/reference_chosen_logps": -141.2090606689453, "debug/reference_rejected_logps": -187.01593017578125, "epoch": 0.9384615384615385, "grad_norm": 39.967905719093444, "learning_rate": 1e-06, "logits/chosen": -1.06016206741333, "logits/rejected": -1.1642900705337524, "logps/chosen": -114.04878234863281, "logps/rejected": -191.70645141601562, "loss": 0.3311, "rewards/accuracies": 0.875, "rewards/chosen": 0.27160272002220154, "rewards/margins": 0.3185078799724579, "rewards/rejected": -0.04690515249967575, "step": 61 }, { "debug/policy_chosen_logits": -1.075607180595398, "debug/policy_chosen_logps": -113.74881744384766, "debug/policy_rejected_logits": -1.2998212575912476, "debug/policy_rejected_logps": -262.45599365234375, "debug/reference_chosen_logps": -137.3640594482422, "debug/reference_rejected_logps": -241.2454833984375, "epoch": 0.9538461538461539, "grad_norm": 33.971478319739994, "learning_rate": 1e-06, "logits/chosen": -1.075607180595398, "logits/rejected": -1.2998212575912476, "logps/chosen": -113.74881744384766, "logps/rejected": -262.45599365234375, "loss": 0.3887, "rewards/accuracies": 0.75, "rewards/chosen": 0.23615238070487976, "rewards/margins": 0.4482572674751282, "rewards/rejected": -0.21210487186908722, "step": 62 }, { "debug/policy_chosen_logits": -0.8238174915313721, "debug/policy_chosen_logps": -170.8466796875, "debug/policy_rejected_logits": -1.3161985874176025, "debug/policy_rejected_logps": -237.17462158203125, "debug/reference_chosen_logps": -183.77645874023438, "debug/reference_rejected_logps": -219.39682006835938, "epoch": 0.9692307692307692, "grad_norm": 32.654718278827374, "learning_rate": 1e-06, "logits/chosen": -0.8238174915313721, "logits/rejected": -1.3161985874176025, "logps/chosen": -170.8466796875, "logps/rejected": -237.17462158203125, "loss": 0.3458, "rewards/accuracies": 0.75, "rewards/chosen": 0.12929785251617432, "rewards/margins": 0.3070757985115051, "rewards/rejected": -0.1777779459953308, "step": 63 }, { "debug/policy_chosen_logits": -1.0263506174087524, "debug/policy_chosen_logps": -125.7145767211914, "debug/policy_rejected_logits": -1.2933566570281982, "debug/policy_rejected_logps": -257.61431884765625, "debug/reference_chosen_logps": -134.1392822265625, "debug/reference_rejected_logps": -232.3591766357422, "epoch": 0.9846153846153847, "grad_norm": 39.88494751069702, "learning_rate": 1e-06, "logits/chosen": -1.0263506174087524, "logits/rejected": -1.2933566570281982, "logps/chosen": -125.7145767211914, "logps/rejected": -257.61431884765625, "loss": 0.359, "rewards/accuracies": 0.75, "rewards/chosen": 0.08424711972475052, "rewards/margins": 0.3367987275123596, "rewards/rejected": -0.2525515854358673, "step": 64 }, { "debug/policy_chosen_logits": -0.9630184173583984, "debug/policy_chosen_logps": -143.5438995361328, "debug/policy_rejected_logits": -0.9999608397483826, "debug/policy_rejected_logps": -217.00332641601562, "debug/reference_chosen_logps": -152.22354125976562, "debug/reference_rejected_logps": -208.64981079101562, "epoch": 1.0, "grad_norm": 21.748058770965844, "learning_rate": 1e-06, "logits/chosen": -0.9630184173583984, "logits/rejected": -0.9999608397483826, "logps/chosen": -143.5438995361328, "logps/rejected": -217.00332641601562, "loss": 0.3368, "rewards/accuracies": 0.625, "rewards/chosen": 0.08679643273353577, "rewards/margins": 0.17033153772354126, "rewards/rejected": -0.08353511989116669, "step": 65 }, { "epoch": 1.0, "step": 65, "total_flos": 0.0, "train_loss": 0.3866597702870002, "train_runtime": 200.7231, "train_samples_per_second": 20.655, "train_steps_per_second": 0.324 } ], "logging_steps": 1, "max_steps": 65, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }