{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 391, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025575447570332483, "grad_norm": 42.05885932037307, "learning_rate": 2.5e-09, "logits/chosen": -4.623842239379883, "logits/rejected": -4.85917854309082, "logps/chosen": -239.31422424316406, "logps/rejected": -207.56365966796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02557544757033248, "grad_norm": 39.560773735648084, "learning_rate": 2.5e-08, "logits/chosen": -4.334544658660889, "logits/rejected": -4.644796848297119, "logps/chosen": -265.15618896484375, "logps/rejected": -215.6714630126953, "loss": 0.693, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": -0.0004928099224343896, "rewards/margins": -0.0008595392573624849, "rewards/rejected": 0.00036672933492809534, "step": 10 }, { "epoch": 0.05115089514066496, "grad_norm": 40.83271143256618, "learning_rate": 5e-08, "logits/chosen": -4.509532928466797, "logits/rejected": -4.744012832641602, "logps/chosen": -267.80267333984375, "logps/rejected": -216.80471801757812, "loss": 0.6919, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0013727399054914713, "rewards/margins": 0.0033264080993831158, "rewards/rejected": -0.0019536681938916445, "step": 20 }, { "epoch": 0.07672634271099744, "grad_norm": 43.48154475134036, "learning_rate": 7.5e-08, "logits/chosen": -4.5965423583984375, "logits/rejected": -4.777901649475098, "logps/chosen": -257.59088134765625, "logps/rejected": -215.49658203125, "loss": 0.6845, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.00941941887140274, "rewards/margins": 0.019057607278227806, "rewards/rejected": -0.00963818933814764, "step": 30 }, { "epoch": 0.10230179028132992, "grad_norm": 43.11247032025707, "learning_rate": 1e-07, "logits/chosen": -4.648722171783447, "logits/rejected": -4.745718002319336, "logps/chosen": -250.10897827148438, "logps/rejected": -223.86532592773438, "loss": 0.6588, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.03490210697054863, "rewards/margins": 0.07684428989887238, "rewards/rejected": -0.041942186653614044, "step": 40 }, { "epoch": 0.1278772378516624, "grad_norm": 47.11742069616159, "learning_rate": 9.979985922607475e-08, "logits/chosen": -4.593738555908203, "logits/rejected": -4.8337082862854, "logps/chosen": -267.30694580078125, "logps/rejected": -239.9588623046875, "loss": 0.6057, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.04662395641207695, "rewards/margins": 0.1874973475933075, "rewards/rejected": -0.14087337255477905, "step": 50 }, { "epoch": 0.1534526854219949, "grad_norm": 39.18274034042972, "learning_rate": 9.92010391574745e-08, "logits/chosen": -4.788964748382568, "logits/rejected": -4.883444786071777, "logps/chosen": -237.8981475830078, "logps/rejected": -257.84942626953125, "loss": 0.5174, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03224308043718338, "rewards/margins": 0.46052321791648865, "rewards/rejected": -0.4282800555229187, "step": 60 }, { "epoch": 0.17902813299232737, "grad_norm": 34.832880831116846, "learning_rate": 9.820833372667812e-08, "logits/chosen": -4.657534599304199, "logits/rejected": -4.817151069641113, "logps/chosen": -249.996337890625, "logps/rejected": -280.097412109375, "loss": 0.4614, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.031456105411052704, "rewards/margins": 0.6548057198524475, "rewards/rejected": -0.6233495473861694, "step": 70 }, { "epoch": 0.20460358056265984, "grad_norm": 36.47722570862778, "learning_rate": 9.682969016701356e-08, "logits/chosen": -4.626967430114746, "logits/rejected": -4.778214454650879, "logps/chosen": -250.9975128173828, "logps/rejected": -311.1219177246094, "loss": 0.3904, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.10222460329532623, "rewards/margins": 0.9102567434310913, "rewards/rejected": -0.8080320358276367, "step": 80 }, { "epoch": 0.23017902813299232, "grad_norm": 30.998854450156045, "learning_rate": 9.507614539004081e-08, "logits/chosen": -4.739785194396973, "logits/rejected": -4.909841060638428, "logps/chosen": -237.671875, "logps/rejected": -307.8204040527344, "loss": 0.3509, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.1354086697101593, "rewards/margins": 1.1853126287460327, "rewards/rejected": -1.0499038696289062, "step": 90 }, { "epoch": 0.2557544757033248, "grad_norm": 42.52785579314538, "learning_rate": 9.296173762811083e-08, "logits/chosen": -4.647661209106445, "logits/rejected": -4.924945831298828, "logps/chosen": -244.45303344726562, "logps/rejected": -355.6828918457031, "loss": 0.3413, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.12265179306268692, "rewards/margins": 1.4083904027938843, "rewards/rejected": -1.2857385873794556, "step": 100 }, { "epoch": 0.2557544757033248, "eval_logits/chosen": -4.821703910827637, "eval_logits/rejected": -5.045117378234863, "eval_logps/chosen": -444.5645751953125, "eval_logps/rejected": -575.9554443359375, "eval_loss": 0.7230384349822998, "eval_rewards/accuracies": 0.515625, "eval_rewards/chosen": -0.5409007668495178, "eval_rewards/margins": 0.03477693349123001, "eval_rewards/rejected": -0.575677752494812, "eval_runtime": 98.6304, "eval_samples_per_second": 20.278, "eval_steps_per_second": 0.324, "step": 100 }, { "epoch": 0.2813299232736573, "grad_norm": 42.40417010662429, "learning_rate": 9.050339404945832e-08, "logits/chosen": -4.8084492683410645, "logits/rejected": -5.027788162231445, "logps/chosen": -247.86376953125, "logps/rejected": -369.55267333984375, "loss": 0.3143, "rewards/accuracies": 0.90625, "rewards/chosen": 0.01321962010115385, "rewards/margins": 1.5418872833251953, "rewards/rejected": -1.528667688369751, "step": 110 }, { "epoch": 0.3069053708439898, "grad_norm": 31.11429497548564, "learning_rate": 8.77207952455395e-08, "logits/chosen": -4.781357765197754, "logits/rejected": -5.055319786071777, "logps/chosen": -271.8451843261719, "logps/rejected": -396.73046875, "loss": 0.3042, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.041443757712841034, "rewards/margins": 1.7226619720458984, "rewards/rejected": -1.6812183856964111, "step": 120 }, { "epoch": 0.33248081841432225, "grad_norm": 32.05773581279916, "learning_rate": 8.463621767547997e-08, "logits/chosen": -4.876931190490723, "logits/rejected": -5.202266693115234, "logps/chosen": -264.2982482910156, "logps/rejected": -409.0570983886719, "loss": 0.2914, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.058543670922517776, "rewards/margins": 1.8787403106689453, "rewards/rejected": -1.937284231185913, "step": 130 }, { "epoch": 0.35805626598465473, "grad_norm": 31.96087329942538, "learning_rate": 8.127435532896387e-08, "logits/chosen": -4.971903324127197, "logits/rejected": -5.277985095977783, "logps/chosen": -305.4132385253906, "logps/rejected": -457.46343994140625, "loss": 0.274, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.2121816873550415, "rewards/margins": 2.040717601776123, "rewards/rejected": -2.252899408340454, "step": 140 }, { "epoch": 0.3836317135549872, "grad_norm": 40.46461234858551, "learning_rate": 7.766212203526569e-08, "logits/chosen": -5.087113857269287, "logits/rejected": -5.368134498596191, "logps/chosen": -274.01080322265625, "logps/rejected": -457.38330078125, "loss": 0.2819, "rewards/accuracies": 0.875, "rewards/chosen": -0.2159254252910614, "rewards/margins": 2.166714906692505, "rewards/rejected": -2.3826401233673096, "step": 150 }, { "epoch": 0.4092071611253197, "grad_norm": 32.057320142788335, "learning_rate": 7.382843600106538e-08, "logits/chosen": -5.177260398864746, "logits/rejected": -5.416450023651123, "logps/chosen": -284.1901550292969, "logps/rejected": -474.3257751464844, "loss": 0.2436, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.3273366093635559, "rewards/margins": 2.2598299980163574, "rewards/rejected": -2.5871663093566895, "step": 160 }, { "epoch": 0.43478260869565216, "grad_norm": 33.151157821087715, "learning_rate": 6.980398830195784e-08, "logits/chosen": -5.109088897705078, "logits/rejected": -5.438628196716309, "logps/chosen": -296.1925964355469, "logps/rejected": -516.4288940429688, "loss": 0.2364, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3511837124824524, "rewards/margins": 2.6150753498077393, "rewards/rejected": -2.966259002685547, "step": 170 }, { "epoch": 0.46035805626598464, "grad_norm": 34.18806970089564, "learning_rate": 6.562099718102787e-08, "logits/chosen": -5.2773332595825195, "logits/rejected": -5.568037509918213, "logps/chosen": -284.951904296875, "logps/rejected": -486.5365295410156, "loss": 0.2628, "rewards/accuracies": 0.875, "rewards/chosen": -0.480882465839386, "rewards/margins": 2.4242804050445557, "rewards/rejected": -2.9051625728607178, "step": 180 }, { "epoch": 0.4859335038363171, "grad_norm": 33.03269272782741, "learning_rate": 6.131295012148612e-08, "logits/chosen": -5.19248104095459, "logits/rejected": -5.355208396911621, "logps/chosen": -311.060791015625, "logps/rejected": -542.6156005859375, "loss": 0.2517, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.5016793012619019, "rewards/margins": 2.5728163719177246, "rewards/rejected": -3.074495792388916, "step": 190 }, { "epoch": 0.5115089514066496, "grad_norm": 40.925552268276135, "learning_rate": 5.691433575823665e-08, "logits/chosen": -5.236765384674072, "logits/rejected": -5.465119361877441, "logps/chosen": -302.1981201171875, "logps/rejected": -515.2794799804688, "loss": 0.2653, "rewards/accuracies": 0.875, "rewards/chosen": -0.5005888342857361, "rewards/margins": 2.4987406730651855, "rewards/rejected": -2.9993293285369873, "step": 200 }, { "epoch": 0.5115089514066496, "eval_logits/chosen": -5.226232528686523, "eval_logits/rejected": -5.50424337387085, "eval_logps/chosen": -540.43896484375, "eval_logps/rejected": -679.8809814453125, "eval_loss": 0.7765124440193176, "eval_rewards/accuracies": 0.54296875, "eval_rewards/chosen": -1.4996453523635864, "eval_rewards/margins": 0.11528739333152771, "eval_rewards/rejected": -1.6149327754974365, "eval_runtime": 98.5941, "eval_samples_per_second": 20.285, "eval_steps_per_second": 0.325, "step": 200 }, { "epoch": 0.5370843989769821, "grad_norm": 26.659672604447973, "learning_rate": 5.2460367774593905e-08, "logits/chosen": -5.310137748718262, "logits/rejected": -5.583542823791504, "logps/chosen": -303.67047119140625, "logps/rejected": -573.1016845703125, "loss": 0.2296, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.37785404920578003, "rewards/margins": 3.0667028427124023, "rewards/rejected": -3.444556713104248, "step": 210 }, { "epoch": 0.5626598465473146, "grad_norm": 35.355172011912686, "learning_rate": 4.798670299452925e-08, "logits/chosen": -5.1389665603637695, "logits/rejected": -5.567061424255371, "logps/chosen": -304.0540466308594, "logps/rejected": -569.4851684570312, "loss": 0.245, "rewards/accuracies": 0.90625, "rewards/chosen": -0.41242700815200806, "rewards/margins": 3.0356929302215576, "rewards/rejected": -3.4481201171875, "step": 220 }, { "epoch": 0.5882352941176471, "grad_norm": 37.35765448344736, "learning_rate": 4.3529155927297226e-08, "logits/chosen": -5.210625648498535, "logits/rejected": -5.601117134094238, "logps/chosen": -323.33135986328125, "logps/rejected": -591.130126953125, "loss": 0.2477, "rewards/accuracies": 0.90625, "rewards/chosen": -0.623622715473175, "rewards/margins": 3.050567150115967, "rewards/rejected": -3.674190044403076, "step": 230 }, { "epoch": 0.6138107416879796, "grad_norm": 27.168387739658527, "learning_rate": 3.9123412049691636e-08, "logits/chosen": -5.26107120513916, "logits/rejected": -5.582613945007324, "logps/chosen": -341.65289306640625, "logps/rejected": -593.1688232421875, "loss": 0.2349, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.68829745054245, "rewards/margins": 3.034055233001709, "rewards/rejected": -3.7223525047302246, "step": 240 }, { "epoch": 0.639386189258312, "grad_norm": 34.59601076495169, "learning_rate": 3.480474212128766e-08, "logits/chosen": -5.441601753234863, "logits/rejected": -5.72822380065918, "logps/chosen": -329.5417175292969, "logps/rejected": -537.7394409179688, "loss": 0.2339, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8084025382995605, "rewards/margins": 2.4929001331329346, "rewards/rejected": -3.301302433013916, "step": 250 }, { "epoch": 0.6649616368286445, "grad_norm": 44.5395657806438, "learning_rate": 3.060771981975726e-08, "logits/chosen": -5.302738666534424, "logits/rejected": -5.622676372528076, "logps/chosen": -326.24041748046875, "logps/rejected": -637.6575927734375, "loss": 0.2325, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7219182848930359, "rewards/margins": 3.4571731090545654, "rewards/rejected": -4.179091453552246, "step": 260 }, { "epoch": 0.690537084398977, "grad_norm": 33.64914034772639, "learning_rate": 2.6565944956764818e-08, "logits/chosen": -5.4421281814575195, "logits/rejected": -5.695931911468506, "logps/chosen": -332.70892333984375, "logps/rejected": -598.5055541992188, "loss": 0.2433, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.7167563438415527, "rewards/margins": 3.118049144744873, "rewards/rejected": -3.834805727005005, "step": 270 }, { "epoch": 0.7161125319693095, "grad_norm": 38.28164920230575, "learning_rate": 2.2711774490274766e-08, "logits/chosen": -5.344332695007324, "logits/rejected": -5.591184616088867, "logps/chosen": -331.06939697265625, "logps/rejected": -640.959228515625, "loss": 0.2174, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.665625810623169, "rewards/margins": 3.3145720958709717, "rewards/rejected": -3.9801979064941406, "step": 280 }, { "epoch": 0.7416879795396419, "grad_norm": 42.555865291815444, "learning_rate": 1.9076063486687256e-08, "logits/chosen": -5.223475933074951, "logits/rejected": -5.618660926818848, "logps/chosen": -328.63055419921875, "logps/rejected": -579.0905151367188, "loss": 0.2228, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.5602216124534607, "rewards/margins": 3.074389696121216, "rewards/rejected": -3.634611129760742, "step": 290 }, { "epoch": 0.7672634271099744, "grad_norm": 40.820437800178965, "learning_rate": 1.5687918106563324e-08, "logits/chosen": -5.369271755218506, "logits/rejected": -5.632781028747559, "logps/chosen": -320.268798828125, "logps/rejected": -608.9943237304688, "loss": 0.2424, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.6686061024665833, "rewards/margins": 3.2616829872131348, "rewards/rejected": -3.9302895069122314, "step": 300 }, { "epoch": 0.7672634271099744, "eval_logits/chosen": -5.394677639007568, "eval_logits/rejected": -5.655616283416748, "eval_logps/chosen": -579.441650390625, "eval_logps/rejected": -718.423828125, "eval_loss": 0.8002758622169495, "eval_rewards/accuracies": 0.52734375, "eval_rewards/chosen": -1.889671802520752, "eval_rewards/margins": 0.11068924516439438, "eval_rewards/rejected": -2.000361442565918, "eval_runtime": 98.5861, "eval_samples_per_second": 20.287, "eval_steps_per_second": 0.325, "step": 300 }, { "epoch": 0.7928388746803069, "grad_norm": 33.64379879568246, "learning_rate": 1.257446259144494e-08, "logits/chosen": -5.246872425079346, "logits/rejected": -5.653367042541504, "logps/chosen": -315.7105407714844, "logps/rejected": -625.9619140625, "loss": 0.2323, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.5457090735435486, "rewards/margins": 3.491931200027466, "rewards/rejected": -4.03764009475708, "step": 310 }, { "epoch": 0.8184143222506394, "grad_norm": 35.35694379401523, "learning_rate": 9.760622117187234e-09, "logits/chosen": -5.381436824798584, "logits/rejected": -5.7473673820495605, "logps/chosen": -314.6531677246094, "logps/rejected": -594.841552734375, "loss": 0.2466, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.6948888897895813, "rewards/margins": 3.1890125274658203, "rewards/rejected": -3.8839008808135986, "step": 320 }, { "epoch": 0.8439897698209718, "grad_norm": 34.515465680243125, "learning_rate": 7.2689232521989885e-09, "logits/chosen": -5.308783531188965, "logits/rejected": -5.656357765197754, "logps/chosen": -347.4857482910156, "logps/rejected": -629.4615478515625, "loss": 0.2233, "rewards/accuracies": 0.875, "rewards/chosen": -0.7933691143989563, "rewards/margins": 3.1742498874664307, "rewards/rejected": -3.967618942260742, "step": 330 }, { "epoch": 0.8695652173913043, "grad_norm": 28.542655038843865, "learning_rate": 5.119313618049309e-09, "logits/chosen": -5.346091270446777, "logits/rejected": -5.756931781768799, "logps/chosen": -338.65509033203125, "logps/rejected": -575.4979858398438, "loss": 0.2174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.6450907588005066, "rewards/margins": 3.0420687198638916, "rewards/rejected": -3.687159299850464, "step": 340 }, { "epoch": 0.8951406649616368, "grad_norm": 33.42105425863571, "learning_rate": 3.3290021961708158e-09, "logits/chosen": -5.374421119689941, "logits/rejected": -5.536851406097412, "logps/chosen": -333.8661193847656, "logps/rejected": -595.2741088867188, "loss": 0.2467, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.7839492559432983, "rewards/margins": 2.891091823577881, "rewards/rejected": -3.6750411987304688, "step": 350 }, { "epoch": 0.9207161125319693, "grad_norm": 33.175441995042306, "learning_rate": 1.9123215591052013e-09, "logits/chosen": -5.3232526779174805, "logits/rejected": -5.559803485870361, "logps/chosen": -337.17694091796875, "logps/rejected": -596.7660522460938, "loss": 0.2397, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.7554206252098083, "rewards/margins": 2.937615156173706, "rewards/rejected": -3.693035840988159, "step": 360 }, { "epoch": 0.9462915601023018, "grad_norm": 33.50889046296721, "learning_rate": 8.806131292167618e-10, "logits/chosen": -5.363125801086426, "logits/rejected": -5.561426162719727, "logps/chosen": -327.09295654296875, "logps/rejected": -608.7786865234375, "loss": 0.238, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.7027177214622498, "rewards/margins": 3.01659893989563, "rewards/rejected": -3.719316005706787, "step": 370 }, { "epoch": 0.9718670076726342, "grad_norm": 42.30140132740828, "learning_rate": 2.4213638345040867e-10, "logits/chosen": -5.489308834075928, "logits/rejected": -5.787456512451172, "logps/chosen": -332.35858154296875, "logps/rejected": -607.3480224609375, "loss": 0.2341, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.7027586698532104, "rewards/margins": 3.165475368499756, "rewards/rejected": -3.8682339191436768, "step": 380 }, { "epoch": 0.9974424552429667, "grad_norm": 37.88179259111206, "learning_rate": 2.0027310073833516e-12, "logits/chosen": -5.485334873199463, "logits/rejected": -5.764852046966553, "logps/chosen": -331.56610107421875, "logps/rejected": -614.2138061523438, "loss": 0.2223, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7189357876777649, "rewards/margins": 3.2180511951446533, "rewards/rejected": -3.9369864463806152, "step": 390 }, { "epoch": 1.0, "step": 391, "total_flos": 0.0, "train_loss": 0.3220548828315857, "train_runtime": 6253.066, "train_samples_per_second": 7.996, "train_steps_per_second": 0.063 } ], "logging_steps": 10, "max_steps": 391, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }