{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 100, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.400390625, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.2547454833984375, "logits/rejected": -2.401865005493164, "logps/chosen": -53.759212493896484, "logps/rejected": -48.83185958862305, "loss": 0.6931, "pred_label": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "use_label": 0.0 }, { "epoch": 0.02, "grad_norm": 0.4609375, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.2421462535858154, "logits/rejected": -2.2770614624023438, "logps/chosen": -51.98179626464844, "logps/rejected": -64.9604263305664, "loss": 0.6929, "pred_label": 0.0, "rewards/accuracies": 0.2222222238779068, "rewards/chosen": 0.001975727966055274, "rewards/margins": 0.00047667179023846984, "rewards/rejected": 0.001499056350439787, "step": 10, "use_label": 0.0 }, { "epoch": 0.04, "grad_norm": 0.39453125, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -2.2520272731781006, "logits/rejected": -2.255510091781616, "logps/chosen": -62.492515563964844, "logps/rejected": -72.63607788085938, "loss": 0.6919, "pred_label": 0.0, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": 0.01601376011967659, "rewards/margins": 0.0011284304782748222, "rewards/rejected": 0.014885328710079193, "step": 20, "use_label": 0.0 }, { "epoch": 0.06, "grad_norm": 0.5078125, "learning_rate": 3.125e-06, "logits/chosen": -2.3422012329101562, "logits/rejected": -2.3548905849456787, "logps/chosen": -79.14694213867188, "logps/rejected": -98.82722473144531, "loss": 0.6898, "pred_label": 0.0, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": 0.030949687585234642, "rewards/margins": 0.0029636542312800884, "rewards/rejected": 0.027986034750938416, "step": 30, "use_label": 0.0 }, { "epoch": 0.08, "grad_norm": 0.515625, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.322833776473999, "logits/rejected": -2.3010501861572266, "logps/chosen": -82.85880279541016, "logps/rejected": -82.40392303466797, "loss": 0.6866, "pred_label": 0.0, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": 0.033333443105220795, "rewards/margins": 0.011918319389224052, "rewards/rejected": 0.021415119990706444, "step": 40, "use_label": 0.0 }, { "epoch": 0.1, "grad_norm": 0.67578125, "learning_rate": 4.999731868769027e-06, "logits/chosen": -2.241189956665039, "logits/rejected": -2.263849973678589, "logps/chosen": -67.93062591552734, "logps/rejected": -81.85546875, "loss": 0.6805, "pred_label": 0.0, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.009002490900456905, "rewards/margins": 0.03016103245317936, "rewards/rejected": -0.02115854248404503, "step": 50, "use_label": 0.0 }, { "epoch": 0.13, "grad_norm": 1.09375, "learning_rate": 4.9903533134293035e-06, "logits/chosen": -2.218756914138794, "logits/rejected": -2.1594481468200684, "logps/chosen": -62.0407600402832, "logps/rejected": -71.9369888305664, "loss": 0.6748, "pred_label": 0.0, "rewards/accuracies": 0.3062500059604645, "rewards/chosen": -0.0231451578438282, "rewards/margins": 0.04653460532426834, "rewards/rejected": -0.06967976689338684, "step": 60, "use_label": 0.0 }, { "epoch": 0.15, "grad_norm": 0.8984375, "learning_rate": 4.967625656594782e-06, "logits/chosen": -2.08909273147583, "logits/rejected": -2.088801383972168, "logps/chosen": -68.09326171875, "logps/rejected": -81.9454116821289, "loss": 0.6684, "pred_label": 0.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.12382155656814575, "rewards/margins": 0.03761869668960571, "rewards/rejected": -0.16144026815891266, "step": 70, "use_label": 0.0 }, { "epoch": 0.17, "grad_norm": 1.15625, "learning_rate": 4.93167072587771e-06, "logits/chosen": -2.20400071144104, "logits/rejected": -2.1452622413635254, "logps/chosen": -55.867881774902344, "logps/rejected": -70.91771697998047, "loss": 0.6588, "pred_label": 0.0, "rewards/accuracies": 0.26875001192092896, "rewards/chosen": -0.0733698159456253, "rewards/margins": 0.10403277724981308, "rewards/rejected": -0.17740261554718018, "step": 80, "use_label": 0.0 }, { "epoch": 0.19, "grad_norm": 1.0546875, "learning_rate": 4.882681251368549e-06, "logits/chosen": -1.991231918334961, "logits/rejected": -1.9964717626571655, "logps/chosen": -72.28443908691406, "logps/rejected": -90.79218292236328, "loss": 0.6587, "pred_label": 0.0, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.13902384042739868, "rewards/margins": 0.08125626295804977, "rewards/rejected": -0.22028008103370667, "step": 90, "use_label": 0.0 }, { "epoch": 0.21, "grad_norm": 2.359375, "learning_rate": 4.8209198325401815e-06, "logits/chosen": -1.9231764078140259, "logits/rejected": -1.9043807983398438, "logps/chosen": -103.5636978149414, "logps/rejected": -96.08602142333984, "loss": 0.6551, "pred_label": 0.0, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.2353379726409912, "rewards/margins": 0.08685441315174103, "rewards/rejected": -0.32219237089157104, "step": 100, "use_label": 0.0 }, { "epoch": 0.21, "eval_logits/chosen": -1.762041687965393, "eval_logits/rejected": -1.7460479736328125, "eval_logps/chosen": -87.55253601074219, "eval_logps/rejected": -114.47212219238281, "eval_loss": 0.652633547782898, "eval_pred_label": 0.0, "eval_rewards/accuracies": 0.3359375, "eval_rewards/chosen": -0.23640292882919312, "eval_rewards/margins": 0.136388897895813, "eval_rewards/rejected": -0.3727918267250061, "eval_runtime": 125.4491, "eval_samples_per_second": 15.943, "eval_steps_per_second": 0.255, "eval_use_label": 0.0, "step": 100 }, { "epoch": 0.23, "grad_norm": 1.59375, "learning_rate": 4.746717530629565e-06, "logits/chosen": -1.7847106456756592, "logits/rejected": -1.7590484619140625, "logps/chosen": -85.73925018310547, "logps/rejected": -106.20509338378906, "loss": 0.6557, "pred_label": 0.0, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.14638465642929077, "rewards/margins": 0.12975916266441345, "rewards/rejected": -0.2761438190937042, "step": 110, "use_label": 0.0 }, { "epoch": 0.25, "grad_norm": 1.828125, "learning_rate": 4.660472094042121e-06, "logits/chosen": -1.1902318000793457, "logits/rejected": -1.0542975664138794, "logps/chosen": -108.4779052734375, "logps/rejected": -127.95109558105469, "loss": 0.6493, "pred_label": 0.0, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.38532325625419617, "rewards/margins": 0.1649974286556244, "rewards/rejected": -0.5503206849098206, "step": 120, "use_label": 0.0 }, { "epoch": 0.27, "grad_norm": 1.9375, "learning_rate": 4.5626458262912745e-06, "logits/chosen": -0.818010687828064, "logits/rejected": -0.7847374081611633, "logps/chosen": -109.61775207519531, "logps/rejected": -133.42086791992188, "loss": 0.6524, "pred_label": 0.0, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.43839359283447266, "rewards/margins": 0.16735044121742249, "rewards/rejected": -0.6057440638542175, "step": 130, "use_label": 0.0 }, { "epoch": 0.29, "grad_norm": 1.71875, "learning_rate": 4.453763107901676e-06, "logits/chosen": -0.7395650148391724, "logits/rejected": -0.8444339036941528, "logps/chosen": -116.97528076171875, "logps/rejected": -130.2399139404297, "loss": 0.6381, "pred_label": 0.0, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.3622770607471466, "rewards/margins": 0.1490650475025177, "rewards/rejected": -0.5113420486450195, "step": 140, "use_label": 0.0 }, { "epoch": 0.31, "grad_norm": 2.125, "learning_rate": 4.33440758555951e-06, "logits/chosen": -0.6497868299484253, "logits/rejected": -0.6378159523010254, "logps/chosen": -89.60552978515625, "logps/rejected": -115.42192077636719, "loss": 0.6379, "pred_label": 0.0, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.2445882111787796, "rewards/margins": 0.23124215006828308, "rewards/rejected": -0.4758303761482239, "step": 150, "use_label": 0.0 }, { "epoch": 0.33, "grad_norm": 2.15625, "learning_rate": 4.205219043576955e-06, "logits/chosen": -0.3159053921699524, "logits/rejected": -0.33064812421798706, "logps/chosen": -99.68696594238281, "logps/rejected": -129.45729064941406, "loss": 0.6317, "pred_label": 0.0, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": -0.35356926918029785, "rewards/margins": 0.16687795519828796, "rewards/rejected": -0.5204472541809082, "step": 160, "use_label": 0.0 }, { "epoch": 0.36, "grad_norm": 2.4375, "learning_rate": 4.066889974440757e-06, "logits/chosen": 0.14531800150871277, "logits/rejected": 0.18166163563728333, "logps/chosen": -95.45491027832031, "logps/rejected": -125.1463623046875, "loss": 0.6291, "pred_label": 0.0, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.39946848154067993, "rewards/margins": 0.20978550612926483, "rewards/rejected": -0.609254002571106, "step": 170, "use_label": 0.0 }, { "epoch": 0.38, "grad_norm": 2.453125, "learning_rate": 3.92016186682789e-06, "logits/chosen": -0.3282355070114136, "logits/rejected": -0.21966704726219177, "logps/chosen": -108.00712585449219, "logps/rejected": -128.67587280273438, "loss": 0.649, "pred_label": 0.0, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.4521949887275696, "rewards/margins": 0.27172034978866577, "rewards/rejected": -0.7239152789115906, "step": 180, "use_label": 0.0 }, { "epoch": 0.4, "grad_norm": 1.84375, "learning_rate": 3.7658212309857576e-06, "logits/chosen": -0.889633297920227, "logits/rejected": -0.6851574778556824, "logps/chosen": -91.25111389160156, "logps/rejected": -118.9649887084961, "loss": 0.6461, "pred_label": 0.0, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.32139474153518677, "rewards/margins": 0.22424864768981934, "rewards/rejected": -0.5456433892250061, "step": 190, "use_label": 0.0 }, { "epoch": 0.42, "grad_norm": 1.9453125, "learning_rate": 3.604695382782159e-06, "logits/chosen": -0.8204952478408813, "logits/rejected": -0.7186430096626282, "logps/chosen": -112.41142272949219, "logps/rejected": -120.7835693359375, "loss": 0.6376, "pred_label": 0.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.30735117197036743, "rewards/margins": 0.169038325548172, "rewards/rejected": -0.47638946771621704, "step": 200, "use_label": 0.0 }, { "epoch": 0.42, "eval_logits/chosen": -0.023804781958460808, "eval_logits/rejected": 0.04317883029580116, "eval_logps/chosen": -97.96138000488281, "eval_logps/rejected": -137.9141845703125, "eval_loss": 0.6288520693778992, "eval_pred_label": 0.0, "eval_rewards/accuracies": 0.3671875, "eval_rewards/chosen": -0.34049129486083984, "eval_rewards/margins": 0.26672109961509705, "eval_rewards/rejected": -0.6072123646736145, "eval_runtime": 125.433, "eval_samples_per_second": 15.945, "eval_steps_per_second": 0.255, "eval_use_label": 0.0, "step": 200 }, { "epoch": 0.44, "grad_norm": 2.265625, "learning_rate": 3.437648009023905e-06, "logits/chosen": -0.05805685371160507, "logits/rejected": -0.06056814268231392, "logps/chosen": -88.78871154785156, "logps/rejected": -124.3318862915039, "loss": 0.6218, "pred_label": 0.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.3281395435333252, "rewards/margins": 0.28538644313812256, "rewards/rejected": -0.613525927066803, "step": 210, "use_label": 0.0 }, { "epoch": 0.46, "grad_norm": 2.21875, "learning_rate": 3.265574537815398e-06, "logits/chosen": -0.1400775909423828, "logits/rejected": -0.005620801355689764, "logps/chosen": -133.7158660888672, "logps/rejected": -136.84619140625, "loss": 0.627, "pred_label": 0.0, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.5408719778060913, "rewards/margins": 0.16390959918498993, "rewards/rejected": -0.7047815918922424, "step": 220, "use_label": 0.0 }, { "epoch": 0.48, "grad_norm": 1.8515625, "learning_rate": 3.089397338773569e-06, "logits/chosen": 0.16266627609729767, "logits/rejected": 0.2626825273036957, "logps/chosen": -93.3644027709961, "logps/rejected": -119.67996978759766, "loss": 0.6261, "pred_label": 0.0, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.28929832577705383, "rewards/margins": 0.27991363406181335, "rewards/rejected": -0.5692119598388672, "step": 230, "use_label": 0.0 }, { "epoch": 0.5, "grad_norm": 1.8984375, "learning_rate": 2.9100607788275547e-06, "logits/chosen": 0.854693591594696, "logits/rejected": 0.7261193990707397, "logps/chosen": -99.00528717041016, "logps/rejected": -135.73580932617188, "loss": 0.6295, "pred_label": 0.0, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.2997274696826935, "rewards/margins": 0.3153937757015228, "rewards/rejected": -0.6151211857795715, "step": 240, "use_label": 0.0 }, { "epoch": 0.52, "grad_norm": 2.03125, "learning_rate": 2.72852616010567e-06, "logits/chosen": 0.6816203594207764, "logits/rejected": 0.7033491134643555, "logps/chosen": -119.7255859375, "logps/rejected": -144.8857421875, "loss": 0.6376, "pred_label": 0.0, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.4632648825645447, "rewards/margins": 0.2932681143283844, "rewards/rejected": -0.7565330266952515, "step": 250, "use_label": 0.0 }, { "epoch": 0.54, "grad_norm": 1.8984375, "learning_rate": 2.5457665670441937e-06, "logits/chosen": 0.5938165187835693, "logits/rejected": 0.5592354536056519, "logps/chosen": -110.32804870605469, "logps/rejected": -146.76275634765625, "loss": 0.6162, "pred_label": 0.0, "rewards/accuracies": 0.34375, "rewards/chosen": -0.44222426414489746, "rewards/margins": 0.2809238135814667, "rewards/rejected": -0.7231480479240417, "step": 260, "use_label": 0.0 }, { "epoch": 0.57, "grad_norm": 2.90625, "learning_rate": 2.3627616503391813e-06, "logits/chosen": 0.6390979290008545, "logits/rejected": 0.5789315700531006, "logps/chosen": -123.83528137207031, "logps/rejected": -144.61489868164062, "loss": 0.6162, "pred_label": 0.0, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.5091949701309204, "rewards/margins": 0.24320097267627716, "rewards/rejected": -0.7523959279060364, "step": 270, "use_label": 0.0 }, { "epoch": 0.59, "grad_norm": 2.34375, "learning_rate": 2.1804923757009885e-06, "logits/chosen": 0.8771865963935852, "logits/rejected": 1.0158352851867676, "logps/chosen": -118.5296859741211, "logps/rejected": -138.31729125976562, "loss": 0.6357, "pred_label": 0.0, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.5302416086196899, "rewards/margins": 0.2237352430820465, "rewards/rejected": -0.7539768218994141, "step": 280, "use_label": 0.0 }, { "epoch": 0.61, "grad_norm": 2.59375, "learning_rate": 1.9999357655598894e-06, "logits/chosen": 0.44083184003829956, "logits/rejected": 0.41123947501182556, "logps/chosen": -112.27372741699219, "logps/rejected": -146.95498657226562, "loss": 0.6228, "pred_label": 0.0, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.4572528004646301, "rewards/margins": 0.24868395924568176, "rewards/rejected": -0.7059367299079895, "step": 290, "use_label": 0.0 }, { "epoch": 0.63, "grad_norm": 2.34375, "learning_rate": 1.8220596619089576e-06, "logits/chosen": 0.6273639798164368, "logits/rejected": 0.5140804052352905, "logps/chosen": -123.02046966552734, "logps/rejected": -168.80987548828125, "loss": 0.6196, "pred_label": 0.0, "rewards/accuracies": 0.40625, "rewards/chosen": -0.4542613625526428, "rewards/margins": 0.2926333546638489, "rewards/rejected": -0.7468947172164917, "step": 300, "use_label": 0.0 }, { "epoch": 0.63, "eval_logits/chosen": 1.0944873094558716, "eval_logits/rejected": 1.1831356287002563, "eval_logps/chosen": -102.62176513671875, "eval_logps/rejected": -150.12503051757812, "eval_loss": 0.618873655796051, "eval_pred_label": 0.0, "eval_rewards/accuracies": 0.375, "eval_rewards/chosen": -0.3870951533317566, "eval_rewards/margins": 0.34222573041915894, "eval_rewards/rejected": -0.7293209433555603, "eval_runtime": 125.4362, "eval_samples_per_second": 15.944, "eval_steps_per_second": 0.255, "eval_use_label": 0.0, "step": 300 }, { "epoch": 0.65, "grad_norm": 1.8515625, "learning_rate": 1.647817538357072e-06, "logits/chosen": 0.8131985664367676, "logits/rejected": 0.8752232789993286, "logps/chosen": -91.52378845214844, "logps/rejected": -139.95840454101562, "loss": 0.5999, "pred_label": 0.0, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.3592718541622162, "rewards/margins": 0.3578081727027893, "rewards/rejected": -0.7170799970626831, "step": 310, "use_label": 0.0 }, { "epoch": 0.67, "grad_norm": 2.40625, "learning_rate": 1.4781433892011132e-06, "logits/chosen": 0.9751952886581421, "logits/rejected": 1.1630818843841553, "logps/chosen": -135.82566833496094, "logps/rejected": -168.11805725097656, "loss": 0.6109, "pred_label": 0.0, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.6275521516799927, "rewards/margins": 0.3816707730293274, "rewards/rejected": -1.0092228651046753, "step": 320, "use_label": 0.0 }, { "epoch": 0.69, "grad_norm": 1.984375, "learning_rate": 1.3139467229135999e-06, "logits/chosen": 1.3293979167938232, "logits/rejected": 1.3260401487350464, "logps/chosen": -135.96664428710938, "logps/rejected": -166.52359008789062, "loss": 0.6295, "pred_label": 0.0, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.6585850715637207, "rewards/margins": 0.3205706775188446, "rewards/rejected": -0.9791557192802429, "step": 330, "use_label": 0.0 }, { "epoch": 0.71, "grad_norm": 2.09375, "learning_rate": 1.1561076868822756e-06, "logits/chosen": 0.7383319139480591, "logits/rejected": 0.6407849192619324, "logps/chosen": -150.60504150390625, "logps/rejected": -166.74940490722656, "loss": 0.6247, "pred_label": 0.0, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.658658504486084, "rewards/margins": 0.24373307824134827, "rewards/rejected": -0.9023915529251099, "step": 340, "use_label": 0.0 }, { "epoch": 0.73, "grad_norm": 2.21875, "learning_rate": 1.0054723495346484e-06, "logits/chosen": 0.6359546184539795, "logits/rejected": 0.7167641520500183, "logps/chosen": -163.8385772705078, "logps/rejected": -195.6297607421875, "loss": 0.6138, "pred_label": 0.0, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.7442194819450378, "rewards/margins": 0.3593973219394684, "rewards/rejected": -1.103616714477539, "step": 350, "use_label": 0.0 }, { "epoch": 0.75, "grad_norm": 1.859375, "learning_rate": 8.628481651367876e-07, "logits/chosen": 0.7298086881637573, "logits/rejected": 0.8517257571220398, "logps/chosen": -119.41548156738281, "logps/rejected": -165.3460235595703, "loss": 0.6137, "pred_label": 0.0, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.5577787160873413, "rewards/margins": 0.37339919805526733, "rewards/rejected": -0.9311779141426086, "step": 360, "use_label": 0.0 }, { "epoch": 0.77, "grad_norm": 2.421875, "learning_rate": 7.289996455765749e-07, "logits/chosen": 0.8383787274360657, "logits/rejected": 0.9305205345153809, "logps/chosen": -111.84449768066406, "logps/rejected": -153.93136596679688, "loss": 0.6125, "pred_label": 0.0, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.46409696340560913, "rewards/margins": 0.39606258273124695, "rewards/rejected": -0.8601595759391785, "step": 370, "use_label": 0.0 }, { "epoch": 0.8, "grad_norm": 1.8984375, "learning_rate": 6.046442623320145e-07, "logits/chosen": 0.5329448580741882, "logits/rejected": 0.513522744178772, "logps/chosen": -116.62841796875, "logps/rejected": -165.17893981933594, "loss": 0.6191, "pred_label": 0.0, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.5079302787780762, "rewards/margins": 0.2802185118198395, "rewards/rejected": -0.7881487607955933, "step": 380, "use_label": 0.0 }, { "epoch": 0.82, "grad_norm": 2.4375, "learning_rate": 4.904486005914027e-07, "logits/chosen": 0.8266662359237671, "logits/rejected": 0.5234752893447876, "logps/chosen": -159.83407592773438, "logps/rejected": -186.96768188476562, "loss": 0.6085, "pred_label": 0.0, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.6701575517654419, "rewards/margins": 0.36982032656669617, "rewards/rejected": -1.039977788925171, "step": 390, "use_label": 0.0 }, { "epoch": 0.84, "grad_norm": 2.46875, "learning_rate": 3.8702478614051353e-07, "logits/chosen": 0.511390745639801, "logits/rejected": 0.6720080971717834, "logps/chosen": -116.7987060546875, "logps/rejected": -141.3931884765625, "loss": 0.6139, "pred_label": 0.0, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.4430771768093109, "rewards/margins": 0.3362268805503845, "rewards/rejected": -0.779304027557373, "step": 400, "use_label": 0.0 }, { "epoch": 0.84, "eval_logits/chosen": 1.4532994031906128, "eval_logits/rejected": 1.5453113317489624, "eval_logps/chosen": -112.56050109863281, "eval_logps/rejected": -162.19764709472656, "eval_loss": 0.6157013177871704, "eval_pred_label": 0.0, "eval_rewards/accuracies": 0.37109375, "eval_rewards/chosen": -0.4864824414253235, "eval_rewards/margins": 0.36356455087661743, "eval_rewards/rejected": -0.8500469923019409, "eval_runtime": 125.4203, "eval_samples_per_second": 15.946, "eval_steps_per_second": 0.255, "eval_use_label": 0.0, "step": 400 }, { "epoch": 0.86, "grad_norm": 2.203125, "learning_rate": 2.9492720416985004e-07, "logits/chosen": 0.8359997868537903, "logits/rejected": 0.8144146800041199, "logps/chosen": -110.30177307128906, "logps/rejected": -143.6800079345703, "loss": 0.6222, "pred_label": 0.0, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.4887877404689789, "rewards/margins": 0.3508199453353882, "rewards/rejected": -0.8396075963973999, "step": 410, "use_label": 0.0 }, { "epoch": 0.88, "grad_norm": 1.984375, "learning_rate": 2.1464952759020857e-07, "logits/chosen": 1.027252435684204, "logits/rejected": 0.9827619791030884, "logps/chosen": -106.49784851074219, "logps/rejected": -116.97566223144531, "loss": 0.6216, "pred_label": 0.0, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.4555872976779938, "rewards/margins": 0.20033884048461914, "rewards/rejected": -0.6559261083602905, "step": 420, "use_label": 0.0 }, { "epoch": 0.9, "grad_norm": 1.96875, "learning_rate": 1.4662207078575685e-07, "logits/chosen": 0.9206047058105469, "logits/rejected": 0.8673297166824341, "logps/chosen": -151.376220703125, "logps/rejected": -178.04725646972656, "loss": 0.5986, "pred_label": 0.0, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.5210937261581421, "rewards/margins": 0.46580758690834045, "rewards/rejected": -0.9869012832641602, "step": 430, "use_label": 0.0 }, { "epoch": 0.92, "grad_norm": 2.125, "learning_rate": 9.120948298936422e-08, "logits/chosen": 0.9004503488540649, "logits/rejected": 1.0573413372039795, "logps/chosen": -119.21500396728516, "logps/rejected": -165.19241333007812, "loss": 0.6064, "pred_label": 0.0, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.5231102705001831, "rewards/margins": 0.37818416953086853, "rewards/rejected": -0.9012944102287292, "step": 440, "use_label": 0.0 }, { "epoch": 0.94, "grad_norm": 2.46875, "learning_rate": 4.870879364444109e-08, "logits/chosen": 1.300728440284729, "logits/rejected": 1.0580918788909912, "logps/chosen": -129.29281616210938, "logps/rejected": -178.3690948486328, "loss": 0.6111, "pred_label": 0.0, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.570349931716919, "rewards/margins": 0.3304445147514343, "rewards/rejected": -0.9007943868637085, "step": 450, "use_label": 0.0 }, { "epoch": 0.96, "grad_norm": 1.8359375, "learning_rate": 1.93478202307823e-08, "logits/chosen": 1.1906068325042725, "logits/rejected": 1.2149587869644165, "logps/chosen": -83.74864196777344, "logps/rejected": -130.91348266601562, "loss": 0.6154, "pred_label": 0.0, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.3762877583503723, "rewards/margins": 0.2993956208229065, "rewards/rejected": -0.6756833791732788, "step": 460, "use_label": 0.0 }, { "epoch": 0.98, "grad_norm": 2.375, "learning_rate": 3.283947088983663e-09, "logits/chosen": 1.1844379901885986, "logits/rejected": 0.9474547505378723, "logps/chosen": -113.1079330444336, "logps/rejected": -141.49147033691406, "loss": 0.6213, "pred_label": 0.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.4577876627445221, "rewards/margins": 0.26655709743499756, "rewards/rejected": -0.7243447303771973, "step": 470, "use_label": 0.0 }, { "epoch": 1.0, "step": 477, "total_flos": 0.0, "train_loss": 0.6357159084743924, "train_runtime": 9601.7268, "train_samples_per_second": 6.367, "train_steps_per_second": 0.05 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }