{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08, "eval_steps": 500, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 492.29167556762695, "epoch": 0.00017777777777777779, "grad_norm": 0.08746972070242835, "kl": 0.0, "learning_rate": 7.142857142857142e-08, "loss": 0.0, "reward": 0.02083333395421505, "reward_std": 0.05103103443980217, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 0.0, "step": 1 }, { "completion_length": 488.62500762939453, "epoch": 0.00035555555555555557, "grad_norm": 0.18281958866590367, "kl": 0.0, "learning_rate": 1.4285714285714285e-07, "loss": 0.0, "reward": 0.06250000186264515, "reward_std": 0.1530931070446968, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.0, "step": 2 }, { "completion_length": 481.8958396911621, "epoch": 0.0005333333333333334, "grad_norm": 0.18273061607712965, "kl": 0.00029015541076660156, "learning_rate": 2.1428571428571426e-07, "loss": 0.0, "reward": 0.0833333358168602, "reward_std": 0.20412413775920868, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.02083333395421505, "step": 3 }, { "completion_length": 477.2708396911621, "epoch": 0.0007111111111111111, "grad_norm": 0.1403227179482258, "kl": 0.00024211406707763672, "learning_rate": 2.857142857142857e-07, "loss": 0.0, "reward": 0.0416666679084301, "reward_std": 0.10206207260489464, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.0, "step": 4 }, { "completion_length": 485.31250381469727, "epoch": 0.0008888888888888889, "grad_norm": 0.13329145692597189, "kl": 0.0002715587615966797, "learning_rate": 3.5714285714285716e-07, "loss": 0.0, "reward": 0.0833333358168602, "reward_std": 0.16661180555820465, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.02083333395421505, "step": 5 }, { "completion_length": 485.3958396911621, "epoch": 0.0010666666666666667, "grad_norm": 0.1366397496686443, "kl": 0.0002875328063964844, "learning_rate": 4.285714285714285e-07, "loss": 0.0, "reward": 0.0416666679084301, "reward_std": 0.10206207260489464, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.0, "step": 6 }, { "completion_length": 489.83333587646484, "epoch": 0.0012444444444444445, "grad_norm": 0.0015282362483640434, "kl": 0.0002818107604980469, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 7 }, { "completion_length": 478.1041717529297, "epoch": 0.0014222222222222223, "grad_norm": 0.5082362557559295, "kl": 0.0039789676666259766, "learning_rate": 5.714285714285714e-07, "loss": 0.0002, "reward": 0.02083333395421505, "reward_std": 0.05103103816509247, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 0.0, "step": 8 }, { "completion_length": 498.25000381469727, "epoch": 0.0016, "grad_norm": 0.1284152446682543, "kl": 0.00029969215393066406, "learning_rate": 6.428571428571429e-07, "loss": 0.0, "reward": 0.0416666679084301, "reward_std": 0.10206207260489464, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 0.02083333395421505, "step": 9 }, { "completion_length": 493.75000762939453, "epoch": 0.0017777777777777779, "grad_norm": 0.003296653133727949, "kl": 0.0004258155822753906, "learning_rate": 7.142857142857143e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 10 }, { "completion_length": 479.81250381469727, "epoch": 0.0019555555555555554, "grad_norm": 0.15068988372298686, "kl": 0.0005173683166503906, "learning_rate": 7.857142857142856e-07, "loss": 0.0, "reward": 0.06250000186264515, "reward_std": 0.11558076366782188, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.0, "step": 11 }, { "completion_length": 472.79167556762695, "epoch": 0.0021333333333333334, "grad_norm": 0.004443001493268414, "kl": 0.0008454322814941406, "learning_rate": 8.57142857142857e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 12 }, { "completion_length": 477.2708435058594, "epoch": 0.002311111111111111, "grad_norm": 0.17120574292118212, "kl": 0.0010666847229003906, "learning_rate": 9.285714285714285e-07, "loss": 0.0, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.0, "step": 13 }, { "completion_length": 487.87500381469727, "epoch": 0.002488888888888889, "grad_norm": 0.08858314482153737, "kl": 0.0020389556884765625, "learning_rate": 1e-06, "loss": 0.0001, "reward": 0.02083333395421505, "reward_std": 0.05103103443980217, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 0.0, "step": 14 }, { "completion_length": 463.54167556762695, "epoch": 0.0026666666666666666, "grad_norm": 0.21660240108333578, "kl": 0.0032138824462890625, "learning_rate": 9.999870202927739e-07, "loss": 0.0001, "reward": 0.0833333358168602, "reward_std": 0.16661180183291435, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.0, "step": 15 }, { "completion_length": 442.29167556762695, "epoch": 0.0028444444444444446, "grad_norm": 0.23716444884739032, "kl": 0.00571441650390625, "learning_rate": 9.999480818449865e-07, "loss": 0.0002, "reward": 0.1250000037252903, "reward_std": 0.2686738707125187, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.02083333395421505, "step": 16 }, { "completion_length": 471.8958435058594, "epoch": 0.003022222222222222, "grad_norm": 0.17669793982814788, "kl": 0.00925445556640625, "learning_rate": 9.998831866782768e-07, "loss": 0.0004, "reward": 0.06250000186264515, "reward_std": 0.11558076366782188, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.02083333395421505, "step": 17 }, { "completion_length": 480.1041793823242, "epoch": 0.0032, "grad_norm": 0.14804922191678244, "kl": 0.01300811767578125, "learning_rate": 9.997923381619255e-07, "loss": 0.0005, "reward": 0.06250000186264515, "reward_std": 0.1530931033194065, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.02083333395421505, "step": 18 }, { "completion_length": 450.3125114440918, "epoch": 0.0033777777777777777, "grad_norm": 0.20021449335204797, "kl": 0.0184478759765625, "learning_rate": 9.996755410126814e-07, "loss": 0.0007, "reward": 0.0833333358168602, "reward_std": 0.16661180183291435, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.0, "step": 19 }, { "completion_length": 444.7708435058594, "epoch": 0.0035555555555555557, "grad_norm": 0.1748648048131685, "kl": 0.0309295654296875, "learning_rate": 9.995328012945157e-07, "loss": 0.0012, "reward": 0.08333333395421505, "reward_std": 0.15561354532837868, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.02083333395421505, "step": 20 }, { "completion_length": 421.8541793823242, "epoch": 0.0037333333333333333, "grad_norm": 0.5042804386943773, "kl": 0.04931640625, "learning_rate": 9.993641264183072e-07, "loss": 0.002, "reward": 0.18750000558793545, "reward_std": 0.3842546343803406, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.06250000186264515, "step": 21 }, { "completion_length": 395.2291793823242, "epoch": 0.003911111111111111, "grad_norm": 0.22336033886380158, "kl": 0.08929443359375, "learning_rate": 9.991695251414583e-07, "loss": 0.0036, "reward": 0.06250000186264515, "reward_std": 0.1530931070446968, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.02083333395421505, "step": 22 }, { "completion_length": 396.81250762939453, "epoch": 0.004088888888888889, "grad_norm": 0.2743029521713877, "kl": 0.12481689453125, "learning_rate": 9.989490075674389e-07, "loss": 0.005, "reward": 0.0833333358168602, "reward_std": 0.16661179810762405, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.02083333395421505, "step": 23 }, { "completion_length": 386.7291793823242, "epoch": 0.004266666666666667, "grad_norm": 0.5231775627781411, "kl": 0.15594482421875, "learning_rate": 9.987025851452636e-07, "loss": 0.0062, "reward": 0.1666666716337204, "reward_std": 0.3707359507679939, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.0833333358168602, "step": 24 }, { "completion_length": 389.8541793823242, "epoch": 0.0044444444444444444, "grad_norm": 0.685343943475841, "kl": 0.3055419921875, "learning_rate": 9.984302706688961e-07, "loss": 0.0122, "reward": 0.20833333767950535, "reward_std": 0.42428741604089737, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.0833333358168602, "step": 25 }, { "completion_length": 427.4166717529297, "epoch": 0.004622222222222222, "grad_norm": 0.40943932820956164, "kl": 0.3759765625, "learning_rate": 9.981320782765846e-07, "loss": 0.0151, "reward": 0.20833333767950535, "reward_std": 0.35973768681287766, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.0833333358168602, "step": 26 }, { "completion_length": 360.0208435058594, "epoch": 0.0048, "grad_norm": 4.089935587345267, "kl": 0.62060546875, "learning_rate": 9.978080234501292e-07, "loss": 0.0248, "reward": 0.31250000931322575, "reward_std": 0.4177170805633068, "rewards/equation_reward_func": 0.18750000186264515, "rewards/format_reward_func": 0.1250000037252903, "step": 27 }, { "completion_length": 446.8125114440918, "epoch": 0.004977777777777778, "grad_norm": 0.39386940795175784, "kl": 0.4873046875, "learning_rate": 9.974581230140768e-07, "loss": 0.0195, "reward": 0.06250000186264515, "reward_std": 0.1530931144952774, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 0.0416666679084301, "step": 28 }, { "completion_length": 358.2916774749756, "epoch": 0.005155555555555556, "grad_norm": 0.47853926879988334, "kl": 0.381591796875, "learning_rate": 9.970823951348486e-07, "loss": 0.0153, "reward": 0.43750001676380634, "reward_std": 0.5070193596184254, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.25000000186264515, "step": 29 }, { "completion_length": 392.0625190734863, "epoch": 0.005333333333333333, "grad_norm": 0.9169421130733736, "kl": 0.2674560546875, "learning_rate": 9.966808593197956e-07, "loss": 0.0107, "reward": 0.29166667349636555, "reward_std": 0.45132481306791306, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.14583333767950535, "step": 30 }, { "completion_length": 402.68750762939453, "epoch": 0.005511111111111111, "grad_norm": 0.48152738156086744, "kl": 0.390869140625, "learning_rate": 9.962535364161878e-07, "loss": 0.0157, "reward": 0.2500000074505806, "reward_std": 0.3977733328938484, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.16666666977107525, "step": 31 }, { "completion_length": 397.1875114440918, "epoch": 0.005688888888888889, "grad_norm": 0.5358456569457446, "kl": 0.632568359375, "learning_rate": 9.958004486101293e-07, "loss": 0.0253, "reward": 0.14583333767950535, "reward_std": 0.3572172485291958, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.0416666679084301, "step": 32 }, { "completion_length": 395.5000114440918, "epoch": 0.005866666666666667, "grad_norm": 0.45393406597096747, "kl": 0.548828125, "learning_rate": 9.953216194254085e-07, "loss": 0.0219, "reward": 0.3125000074505806, "reward_std": 0.4986758381128311, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.1250000037252903, "step": 33 }, { "completion_length": 418.12500762939453, "epoch": 0.006044444444444444, "grad_norm": 1.721174464990145, "kl": 0.89892578125, "learning_rate": 9.948170737222762e-07, "loss": 0.0359, "reward": 0.2083333358168602, "reward_std": 0.37717197462916374, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.1458333358168602, "step": 34 }, { "completion_length": 404.9791793823242, "epoch": 0.006222222222222222, "grad_norm": 0.42311176292986397, "kl": 0.53369140625, "learning_rate": 9.94286837696154e-07, "loss": 0.0213, "reward": 0.31250000931322575, "reward_std": 0.45383426919579506, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.16666666977107525, "step": 35 }, { "completion_length": 424.4791793823242, "epoch": 0.0064, "grad_norm": 0.7438227989937438, "kl": 0.407958984375, "learning_rate": 9.937309388762758e-07, "loss": 0.0163, "reward": 0.1250000037252903, "reward_std": 0.306186206638813, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.0833333358168602, "step": 36 }, { "completion_length": 432.0833435058594, "epoch": 0.006577777777777778, "grad_norm": 0.5301151073899182, "kl": 0.273681640625, "learning_rate": 9.931494061242571e-07, "loss": 0.0109, "reward": 0.33333333767950535, "reward_std": 0.5959400944411755, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.1666666716337204, "step": 37 }, { "completion_length": 389.27084732055664, "epoch": 0.0067555555555555554, "grad_norm": 0.6632742047974075, "kl": 0.218505859375, "learning_rate": 9.925422696325974e-07, "loss": 0.0087, "reward": 0.43750000931322575, "reward_std": 0.6827219277620316, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.2291666716337204, "step": 38 }, { "completion_length": 353.2708435058594, "epoch": 0.006933333333333333, "grad_norm": 0.391058687294098, "kl": 0.21881103515625, "learning_rate": 9.919095609231123e-07, "loss": 0.0087, "reward": 0.5000000167638063, "reward_std": 0.5392209477722645, "rewards/equation_reward_func": 0.3125000074505806, "rewards/format_reward_func": 0.18750000558793545, "step": 39 }, { "completion_length": 407.3333435058594, "epoch": 0.0071111111111111115, "grad_norm": 0.469200729103397, "kl": 0.32666015625, "learning_rate": 9.912513128452973e-07, "loss": 0.0131, "reward": 0.2708333358168602, "reward_std": 0.48175449296832085, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.1458333358168602, "step": 40 }, { "completion_length": 398.5208435058594, "epoch": 0.007288888888888889, "grad_norm": 0.4795098047053158, "kl": 0.3126220703125, "learning_rate": 9.905675595746213e-07, "loss": 0.0125, "reward": 0.479166679084301, "reward_std": 0.6324757561087608, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.27083333767950535, "step": 41 }, { "completion_length": 424.1458396911621, "epoch": 0.007466666666666667, "grad_norm": 0.517106705455673, "kl": 0.434326171875, "learning_rate": 9.898583366107536e-07, "loss": 0.0174, "reward": 0.2708333395421505, "reward_std": 0.3604965806007385, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.18750000186264515, "step": 42 }, { "completion_length": 312.1875114440918, "epoch": 0.007644444444444444, "grad_norm": 1.5051043389807441, "kl": 0.644775390625, "learning_rate": 9.8912368077572e-07, "loss": 0.0258, "reward": 0.6041666846722364, "reward_std": 0.7486668117344379, "rewards/equation_reward_func": 0.2083333358168602, "rewards/format_reward_func": 0.3958333395421505, "step": 43 }, { "completion_length": 392.2291793823242, "epoch": 0.007822222222222222, "grad_norm": 0.6184585307648679, "kl": 0.533447265625, "learning_rate": 9.88363630211991e-07, "loss": 0.0214, "reward": 0.33333334513008595, "reward_std": 0.5071536600589752, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.2083333395421505, "step": 44 }, { "completion_length": 403.43750953674316, "epoch": 0.008, "grad_norm": 0.4758095462502418, "kl": 0.531005859375, "learning_rate": 9.875782243805017e-07, "loss": 0.0213, "reward": 0.2708333395421505, "reward_std": 0.6148928552865982, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.1666666716337204, "step": 45 }, { "completion_length": 338.5625057220459, "epoch": 0.008177777777777779, "grad_norm": 0.5117314282157389, "kl": 0.47265625, "learning_rate": 9.867675040586033e-07, "loss": 0.0189, "reward": 0.45833334513008595, "reward_std": 0.6152770519256592, "rewards/equation_reward_func": 0.25000000186264515, "rewards/format_reward_func": 0.20833333767950535, "step": 46 }, { "completion_length": 361.04167556762695, "epoch": 0.008355555555555555, "grad_norm": 0.5540118176815925, "kl": 0.56494140625, "learning_rate": 9.859315113379452e-07, "loss": 0.0226, "reward": 0.2291666716337204, "reward_std": 0.4107687212526798, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.10416666977107525, "step": 47 }, { "completion_length": 376.60417556762695, "epoch": 0.008533333333333334, "grad_norm": 0.6882325279267957, "kl": 0.453369140625, "learning_rate": 9.850702896222908e-07, "loss": 0.0181, "reward": 0.37500001303851604, "reward_std": 0.5071536600589752, "rewards/equation_reward_func": 0.2083333358168602, "rewards/format_reward_func": 0.1666666716337204, "step": 48 }, { "completion_length": 371.43750762939453, "epoch": 0.00871111111111111, "grad_norm": 0.6655719658883651, "kl": 0.404541015625, "learning_rate": 9.841838836252625e-07, "loss": 0.0162, "reward": 0.2916666753590107, "reward_std": 0.49615539610385895, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.1875000037252903, "step": 49 }, { "completion_length": 301.39584159851074, "epoch": 0.008888888888888889, "grad_norm": 0.4870272919286295, "kl": 0.3218994140625, "learning_rate": 9.83272339368022e-07, "loss": 0.0129, "reward": 0.41666667349636555, "reward_std": 0.6257677860558033, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.2291666716337204, "step": 50 }, { "completion_length": 398.3333435058594, "epoch": 0.009066666666666667, "grad_norm": 0.28160060568929596, "kl": 0.2677001953125, "learning_rate": 9.823357041768796e-07, "loss": 0.0107, "reward": 0.16666666977107525, "reward_std": 0.32222534343600273, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.10416666977107525, "step": 51 }, { "completion_length": 366.0208396911621, "epoch": 0.009244444444444444, "grad_norm": 0.44161935224618987, "kl": 0.281494140625, "learning_rate": 9.813740266808373e-07, "loss": 0.0112, "reward": 0.27083333767950535, "reward_std": 0.455240398645401, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.14583333767950535, "step": 52 }, { "completion_length": 289.2083396911621, "epoch": 0.009422222222222222, "grad_norm": 0.4634094627007982, "kl": 0.295166015625, "learning_rate": 9.803873568090647e-07, "loss": 0.0118, "reward": 0.5625000037252903, "reward_std": 0.6678489372134209, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.35416667349636555, "step": 53 }, { "completion_length": 247.25000762939453, "epoch": 0.0096, "grad_norm": 0.5186115988333515, "kl": 0.3134765625, "learning_rate": 9.793757457883061e-07, "loss": 0.0125, "reward": 0.5208333525806665, "reward_std": 0.6603549271821976, "rewards/equation_reward_func": 0.25000000558793545, "rewards/format_reward_func": 0.27083333767950535, "step": 54 }, { "completion_length": 315.91667556762695, "epoch": 0.009777777777777778, "grad_norm": 0.5492950399694214, "kl": 0.3515625, "learning_rate": 9.783392461402207e-07, "loss": 0.0141, "reward": 0.39583334513008595, "reward_std": 0.6474834568798542, "rewards/equation_reward_func": 0.22916667349636555, "rewards/format_reward_func": 0.1666666716337204, "step": 55 }, { "completion_length": 309.16666984558105, "epoch": 0.009955555555555556, "grad_norm": 0.5069091296779727, "kl": 0.338623046875, "learning_rate": 9.772779116786567e-07, "loss": 0.0136, "reward": 0.5833333525806665, "reward_std": 0.6927030570805073, "rewards/equation_reward_func": 0.2500000037252903, "rewards/format_reward_func": 0.33333333767950535, "step": 56 }, { "completion_length": 271.520845413208, "epoch": 0.010133333333333333, "grad_norm": 0.4630715136373293, "kl": 0.3236083984375, "learning_rate": 9.761917975068563e-07, "loss": 0.013, "reward": 0.6250000074505806, "reward_std": 0.661014586687088, "rewards/equation_reward_func": 0.25000000558793545, "rewards/format_reward_func": 0.3750000074505806, "step": 57 }, { "completion_length": 296.2916717529297, "epoch": 0.010311111111111111, "grad_norm": 0.652647892352053, "kl": 0.392822265625, "learning_rate": 9.750809600145952e-07, "loss": 0.0157, "reward": 0.5208333525806665, "reward_std": 0.6347126960754395, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.39583334140479565, "step": 58 }, { "completion_length": 282.5000047683716, "epoch": 0.01048888888888889, "grad_norm": 0.5320134109583597, "kl": 0.41748046875, "learning_rate": 9.739454568752555e-07, "loss": 0.0167, "reward": 0.645833358168602, "reward_std": 0.7514187395572662, "rewards/equation_reward_func": 0.2083333358168602, "rewards/format_reward_func": 0.4375000111758709, "step": 59 }, { "completion_length": 317.3541793823242, "epoch": 0.010666666666666666, "grad_norm": 0.7774866396355492, "kl": 0.62548828125, "learning_rate": 9.7278534704283e-07, "loss": 0.025, "reward": 0.5625000111758709, "reward_std": 0.6851340346038342, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.37500000558793545, "step": 60 }, { "completion_length": 325.020845413208, "epoch": 0.010844444444444445, "grad_norm": 1.0948810326989156, "kl": 0.64453125, "learning_rate": 9.716006907488628e-07, "loss": 0.0258, "reward": 0.5208333488553762, "reward_std": 0.605813056230545, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.4583333432674408, "step": 61 }, { "completion_length": 244.68750762939453, "epoch": 0.011022222222222221, "grad_norm": 1.563284310620131, "kl": 0.613525390625, "learning_rate": 9.703915494993213e-07, "loss": 0.0245, "reward": 0.7916666846722364, "reward_std": 0.7340253219008446, "rewards/equation_reward_func": 0.22916666977107525, "rewards/format_reward_func": 0.5625000167638063, "step": 62 }, { "completion_length": 282.50000762939453, "epoch": 0.0112, "grad_norm": 2.1064327877521154, "kl": 1.42578125, "learning_rate": 9.691579860714032e-07, "loss": 0.057, "reward": 0.5625000149011612, "reward_std": 0.6274054050445557, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.500000013038516, "step": 63 }, { "completion_length": 194.25000667572021, "epoch": 0.011377777777777778, "grad_norm": 1.7833461932829624, "kl": 1.71484375, "learning_rate": 9.67900064510277e-07, "loss": 0.0686, "reward": 0.8125000260770321, "reward_std": 0.6131136827170849, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.6458333544433117, "step": 64 }, { "completion_length": 187.25000286102295, "epoch": 0.011555555555555555, "grad_norm": 3.6199284336418573, "kl": 1.58935546875, "learning_rate": 9.666178501257572e-07, "loss": 0.0635, "reward": 0.7708333618938923, "reward_std": 0.48367293551564217, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.6666666828095913, "step": 65 }, { "completion_length": 186.5625057220459, "epoch": 0.011733333333333333, "grad_norm": 6.170181718041983, "kl": 2.625, "learning_rate": 9.653114094889126e-07, "loss": 0.1052, "reward": 0.7916667051613331, "reward_std": 0.6234788559377193, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.6458333544433117, "step": 66 }, { "completion_length": 172.43750381469727, "epoch": 0.011911111111111112, "grad_norm": 3.3588942097695202, "kl": 1.7451171875, "learning_rate": 9.639808104286116e-07, "loss": 0.0698, "reward": 0.8541667014360428, "reward_std": 0.753696121275425, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.6458333507180214, "step": 67 }, { "completion_length": 167.35416984558105, "epoch": 0.012088888888888889, "grad_norm": 1.2101146404816208, "kl": 1.1162109375, "learning_rate": 9.626261220279987e-07, "loss": 0.0447, "reward": 0.9166667051613331, "reward_std": 0.6435392610728741, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.7291666977107525, "step": 68 }, { "completion_length": 153.66667079925537, "epoch": 0.012266666666666667, "grad_norm": 1.0564151985609145, "kl": 0.7333984375, "learning_rate": 9.612474146209095e-07, "loss": 0.0294, "reward": 1.0416666865348816, "reward_std": 0.4375460147857666, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.8541666865348816, "step": 69 }, { "completion_length": 156.87500762939453, "epoch": 0.012444444444444444, "grad_norm": 1.2014127678752624, "kl": 1.32470703125, "learning_rate": 9.598447597882179e-07, "loss": 0.053, "reward": 0.6875000223517418, "reward_std": 0.6247506737709045, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.604166679084301, "step": 70 }, { "completion_length": 124.20833778381348, "epoch": 0.012622222222222222, "grad_norm": 1.3939165689430868, "kl": 1.7919921875, "learning_rate": 9.584182303541204e-07, "loss": 0.0716, "reward": 0.7916667014360428, "reward_std": 0.5915197134017944, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.708333358168602, "step": 71 }, { "completion_length": 120.33333683013916, "epoch": 0.0128, "grad_norm": 4.070697361675846, "kl": 2.76123046875, "learning_rate": 9.56967900382354e-07, "loss": 0.1107, "reward": 0.9166666939854622, "reward_std": 0.681879960000515, "rewards/equation_reward_func": 0.2083333395421505, "rewards/format_reward_func": 0.7083333507180214, "step": 72 }, { "completion_length": 112.27083587646484, "epoch": 0.012977777777777777, "grad_norm": 5.057509148199061, "kl": 3.8662109375, "learning_rate": 9.55493845172353e-07, "loss": 0.1549, "reward": 0.8125000223517418, "reward_std": 0.6485148780047894, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.6666666828095913, "step": 73 }, { "completion_length": 100.75000381469727, "epoch": 0.013155555555555556, "grad_norm": 1.6660490540082615, "kl": 1.98828125, "learning_rate": 9.539961412553374e-07, "loss": 0.0795, "reward": 0.8750000149011612, "reward_std": 0.5296371467411518, "rewards/equation_reward_func": 0.2083333358168602, "rewards/format_reward_func": 0.6666666865348816, "step": 74 }, { "completion_length": 113.87500190734863, "epoch": 0.013333333333333334, "grad_norm": 1.6938950842175522, "kl": 1.33642578125, "learning_rate": 9.524748663903406e-07, "loss": 0.0535, "reward": 0.8125000223517418, "reward_std": 0.5695351995527744, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.6875000223517418, "step": 75 }, { "completion_length": 97.41666889190674, "epoch": 0.013511111111111111, "grad_norm": 1.8987706737359447, "kl": 1.123291015625, "learning_rate": 9.509300995601719e-07, "loss": 0.045, "reward": 0.7916666902601719, "reward_std": 0.533297847956419, "rewards/equation_reward_func": 0.1041666679084301, "rewards/format_reward_func": 0.6875000186264515, "step": 76 }, { "completion_length": 98.72916984558105, "epoch": 0.01368888888888889, "grad_norm": 3.104011972298094, "kl": 2.25341796875, "learning_rate": 9.493619209673163e-07, "loss": 0.0902, "reward": 0.8541666939854622, "reward_std": 0.7043437324464321, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.666666679084301, "step": 77 }, { "completion_length": 89.43750286102295, "epoch": 0.013866666666666666, "grad_norm": 1.9133019088326877, "kl": 1.931640625, "learning_rate": 9.477704120297696e-07, "loss": 0.0773, "reward": 0.7708333618938923, "reward_std": 0.6641731485724449, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.5833333544433117, "step": 78 }, { "completion_length": 93.45833587646484, "epoch": 0.014044444444444444, "grad_norm": 1.5024014724527306, "kl": 1.484130859375, "learning_rate": 9.461556553768123e-07, "loss": 0.0593, "reward": 0.8125000149011612, "reward_std": 0.6717319972813129, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.6250000223517418, "step": 79 }, { "completion_length": 97.62500286102295, "epoch": 0.014222222222222223, "grad_norm": 1.3010218844311383, "kl": 2.3037109375, "learning_rate": 9.445177348447186e-07, "loss": 0.0922, "reward": 0.7291666865348816, "reward_std": 0.6450728215277195, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.6250000149011612, "step": 80 }, { "completion_length": 93.02083587646484, "epoch": 0.0144, "grad_norm": 1.4449550603366121, "kl": 1.0791015625, "learning_rate": 9.428567354724045e-07, "loss": 0.0432, "reward": 0.8125000298023224, "reward_std": 0.5830309242010117, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.6875000186264515, "step": 81 }, { "completion_length": 86.27083587646484, "epoch": 0.014577777777777778, "grad_norm": 1.6082644108612607, "kl": 2.0986328125, "learning_rate": 9.41172743497012e-07, "loss": 0.0841, "reward": 0.7916666828095913, "reward_std": 0.6250231899321079, "rewards/equation_reward_func": 0.2083333395421505, "rewards/format_reward_func": 0.5833333488553762, "step": 82 }, { "completion_length": 81.50000238418579, "epoch": 0.014755555555555555, "grad_norm": 3.1983476547534795, "kl": 3.3251953125, "learning_rate": 9.394658463494327e-07, "loss": 0.1331, "reward": 0.8750000298023224, "reward_std": 0.49993259087204933, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.7500000149011612, "step": 83 }, { "completion_length": 80.02083587646484, "epoch": 0.014933333333333333, "grad_norm": 5.509760052892182, "kl": 4.7392578125, "learning_rate": 9.377361326497673e-07, "loss": 0.1899, "reward": 0.7708333525806665, "reward_std": 0.6075604781508446, "rewards/equation_reward_func": 0.1666666679084301, "rewards/format_reward_func": 0.6041666809469461, "step": 84 }, { "completion_length": 82.25000238418579, "epoch": 0.015111111111111112, "grad_norm": 3.5108499823149404, "kl": 3.107421875, "learning_rate": 9.359836922027254e-07, "loss": 0.1244, "reward": 0.7291666828095913, "reward_std": 0.5941584445536137, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.6458333544433117, "step": 85 }, { "completion_length": 82.41666984558105, "epoch": 0.015288888888888888, "grad_norm": 2.028459417088561, "kl": 1.29931640625, "learning_rate": 9.342086159929629e-07, "loss": 0.052, "reward": 0.9166666977107525, "reward_std": 0.6568441018462181, "rewards/equation_reward_func": 0.2291666679084301, "rewards/format_reward_func": 0.6875000186264515, "step": 86 }, { "completion_length": 76.08333683013916, "epoch": 0.015466666666666667, "grad_norm": 1.3916338154872232, "kl": 0.656005859375, "learning_rate": 9.324109961803577e-07, "loss": 0.0262, "reward": 0.9166666865348816, "reward_std": 0.46873048692941666, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.7500000149011612, "step": 87 }, { "completion_length": 83.31250381469727, "epoch": 0.015644444444444443, "grad_norm": 1.5055362976480897, "kl": 0.97509765625, "learning_rate": 9.305909260952254e-07, "loss": 0.039, "reward": 0.8541666865348816, "reward_std": 0.5454338155686855, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.7500000223517418, "step": 88 }, { "completion_length": 83.79166984558105, "epoch": 0.015822222222222224, "grad_norm": 1.1633318801455976, "kl": 1.281494140625, "learning_rate": 9.287485002334732e-07, "loss": 0.0512, "reward": 0.8750000223517418, "reward_std": 0.4783512242138386, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.7291666939854622, "step": 89 }, { "completion_length": 78.64583587646484, "epoch": 0.016, "grad_norm": 2.9144797063041006, "kl": 2.5732421875, "learning_rate": 9.268838142516943e-07, "loss": 0.1028, "reward": 0.7916666865348816, "reward_std": 0.652060579508543, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.6458333469927311, "step": 90 }, { "completion_length": 81.66666984558105, "epoch": 0.016177777777777777, "grad_norm": 4.751929306174548, "kl": 4.068359375, "learning_rate": 9.249969649622012e-07, "loss": 0.1627, "reward": 0.6250000149011612, "reward_std": 0.6327161639928818, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.5416666716337204, "step": 91 }, { "completion_length": 82.79166889190674, "epoch": 0.016355555555555557, "grad_norm": 2.236969609049743, "kl": 2.87109375, "learning_rate": 9.23088050327999e-07, "loss": 0.1146, "reward": 0.8125000223517418, "reward_std": 0.6459339037537575, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.6875000223517418, "step": 92 }, { "completion_length": 82.45833587646484, "epoch": 0.016533333333333334, "grad_norm": 2.000499326379198, "kl": 1.4482421875, "learning_rate": 9.211571694577004e-07, "loss": 0.0579, "reward": 0.8958333507180214, "reward_std": 0.6133542768657207, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.7083333544433117, "step": 93 }, { "completion_length": 84.58333587646484, "epoch": 0.01671111111111111, "grad_norm": 1.2548765801344943, "kl": 0.78173828125, "learning_rate": 9.192044226003788e-07, "loss": 0.0313, "reward": 0.8750000298023224, "reward_std": 0.5373301059007645, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.770833358168602, "step": 94 }, { "completion_length": 81.95833587646484, "epoch": 0.016888888888888887, "grad_norm": 1.5059931362189294, "kl": 1.111083984375, "learning_rate": 9.172299111403641e-07, "loss": 0.0444, "reward": 1.0000000298023224, "reward_std": 0.4846614636480808, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.8541666865348816, "step": 95 }, { "completion_length": 84.04166889190674, "epoch": 0.017066666666666667, "grad_norm": 1.3359826696515187, "kl": 1.60986328125, "learning_rate": 9.15233737591979e-07, "loss": 0.0643, "reward": 0.9166667014360428, "reward_std": 0.5707925632596016, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.7708333507180214, "step": 96 }, { "completion_length": 86.62500286102295, "epoch": 0.017244444444444444, "grad_norm": 1.8097820481931228, "kl": 1.9326171875, "learning_rate": 9.132160055942164e-07, "loss": 0.0773, "reward": 0.9166666902601719, "reward_std": 0.4999736212193966, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.7708333488553762, "step": 97 }, { "completion_length": 77.75000190734863, "epoch": 0.01742222222222222, "grad_norm": 1.6992570227449872, "kl": 1.365478515625, "learning_rate": 9.111768199053586e-07, "loss": 0.0546, "reward": 0.9791666939854622, "reward_std": 0.5367976725101471, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8333333507180214, "step": 98 }, { "completion_length": 70.70833492279053, "epoch": 0.0176, "grad_norm": 12.904518500107274, "kl": 1.700439453125, "learning_rate": 9.091162863975388e-07, "loss": 0.0681, "reward": 1.0833333879709244, "reward_std": 0.5429981462657452, "rewards/equation_reward_func": 0.2500000074505806, "rewards/format_reward_func": 0.8333333507180214, "step": 99 }, { "completion_length": 74.75000333786011, "epoch": 0.017777777777777778, "grad_norm": 2.68173021529566, "kl": 1.787109375, "learning_rate": 9.070345120512435e-07, "loss": 0.0715, "reward": 0.9583333656191826, "reward_std": 0.4984116442501545, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8125000223517418, "step": 100 }, { "completion_length": 70.83333492279053, "epoch": 0.017955555555555554, "grad_norm": 1.405751831531719, "kl": 1.2646484375, "learning_rate": 9.049316049497587e-07, "loss": 0.0506, "reward": 0.8750000223517418, "reward_std": 0.39079636335372925, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.833333358168602, "step": 101 }, { "completion_length": 68.66666889190674, "epoch": 0.018133333333333335, "grad_norm": 1.4351105118065817, "kl": 1.413818359375, "learning_rate": 9.028076742735582e-07, "loss": 0.0566, "reward": 0.8958333730697632, "reward_std": 0.5925082266330719, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.770833358168602, "step": 102 }, { "completion_length": 66.25000286102295, "epoch": 0.01831111111111111, "grad_norm": 2.635755872511972, "kl": 1.2666015625, "learning_rate": 9.006628302946357e-07, "loss": 0.0507, "reward": 0.8958333656191826, "reward_std": 0.5383186265826225, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.7916666939854622, "step": 103 }, { "completion_length": 66.04166889190674, "epoch": 0.018488888888888888, "grad_norm": 1.2582201461515905, "kl": 0.912353515625, "learning_rate": 8.984971843707787e-07, "loss": 0.0365, "reward": 0.916666679084301, "reward_std": 0.3247256837785244, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.8750000223517418, "step": 104 }, { "completion_length": 65.89583539962769, "epoch": 0.018666666666666668, "grad_norm": 1.970155942749344, "kl": 0.91162109375, "learning_rate": 8.963108489397875e-07, "loss": 0.0364, "reward": 0.9166666865348816, "reward_std": 0.33336182311177254, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.854166679084301, "step": 105 }, { "completion_length": 64.52083396911621, "epoch": 0.018844444444444445, "grad_norm": 1.3485959618130419, "kl": 0.93115234375, "learning_rate": 8.94103937513637e-07, "loss": 0.0372, "reward": 0.9375000298023224, "reward_std": 0.2982207238674164, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.8958333507180214, "step": 106 }, { "completion_length": 63.06250190734863, "epoch": 0.01902222222222222, "grad_norm": 2.0728028572795543, "kl": 1.34375, "learning_rate": 8.918765646725843e-07, "loss": 0.0538, "reward": 0.937500037252903, "reward_std": 0.31970490887761116, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.8958333507180214, "step": 107 }, { "completion_length": 61.979167461395264, "epoch": 0.0192, "grad_norm": 2.225767300026043, "kl": 2.87646484375, "learning_rate": 8.896288460592185e-07, "loss": 0.1152, "reward": 0.8750000223517418, "reward_std": 0.2861081585288048, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 0.854166679084301, "step": 108 }, { "completion_length": 57.812501430511475, "epoch": 0.01937777777777778, "grad_norm": 1.3463463862492873, "kl": 1.247802734375, "learning_rate": 8.873608983724579e-07, "loss": 0.05, "reward": 1.0208333656191826, "reward_std": 0.4833719953894615, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8750000149011612, "step": 109 }, { "completion_length": 58.250001430511475, "epoch": 0.019555555555555555, "grad_norm": 1.666008392324381, "kl": 1.627685546875, "learning_rate": 8.850728393614901e-07, "loss": 0.0651, "reward": 0.9375000149011612, "reward_std": 0.4140563830733299, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8333333507180214, "step": 110 }, { "completion_length": 56.85416841506958, "epoch": 0.019733333333333332, "grad_norm": 0.9130518937737289, "kl": 0.876220703125, "learning_rate": 8.8276478781966e-07, "loss": 0.035, "reward": 1.0208333805203438, "reward_std": 0.4418273940682411, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8958333507180214, "step": 111 }, { "completion_length": 53.229167461395264, "epoch": 0.019911111111111112, "grad_norm": 2.1598126065522223, "kl": 0.886474609375, "learning_rate": 8.804368635783002e-07, "loss": 0.0355, "reward": 0.9583333730697632, "reward_std": 0.38524314761161804, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.895833358168602, "step": 112 }, { "completion_length": 54.791667461395264, "epoch": 0.02008888888888889, "grad_norm": 1.4035187794909976, "kl": 1.21337890625, "learning_rate": 8.780891875005114e-07, "loss": 0.0487, "reward": 0.9375000298023224, "reward_std": 0.28219257295131683, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.8958333432674408, "step": 113 }, { "completion_length": 49.97916793823242, "epoch": 0.020266666666666665, "grad_norm": 1.8833740064941022, "kl": 1.38671875, "learning_rate": 8.75721881474886e-07, "loss": 0.0555, "reward": 1.0416667014360428, "reward_std": 0.38524315133690834, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.9166666865348816, "step": 114 }, { "completion_length": 48.83333444595337, "epoch": 0.020444444444444446, "grad_norm": 1.3339255087510238, "kl": 0.90771484375, "learning_rate": 8.733350684091805e-07, "loss": 0.0363, "reward": 0.8333333544433117, "reward_std": 0.4297148324549198, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.7708333544433117, "step": 115 }, { "completion_length": 50.875000953674316, "epoch": 0.020622222222222222, "grad_norm": 1.1195993054785263, "kl": 0.71923828125, "learning_rate": 8.709288722239342e-07, "loss": 0.0288, "reward": 1.0625000447034836, "reward_std": 0.4778187908232212, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.9166666865348816, "step": 116 }, { "completion_length": 53.10416793823242, "epoch": 0.0208, "grad_norm": 0.8633281856230262, "kl": 1.0654296875, "learning_rate": 8.685034178460353e-07, "loss": 0.0427, "reward": 1.0000000596046448, "reward_std": 0.4513138346374035, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.895833358168602, "step": 117 }, { "completion_length": 49.85416841506958, "epoch": 0.02097777777777778, "grad_norm": 0.6227616661901212, "kl": 0.8115234375, "learning_rate": 8.660588312022343e-07, "loss": 0.0324, "reward": 1.0625000298023224, "reward_std": 0.3397653251886368, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.9375000074505806, "step": 118 }, { "completion_length": 55.437500953674316, "epoch": 0.021155555555555556, "grad_norm": 0.9190492054206527, "kl": 0.992919921875, "learning_rate": 8.635952392126071e-07, "loss": 0.0397, "reward": 0.9583333432674408, "reward_std": 0.24161884933710098, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8958333432674408, "step": 119 }, { "completion_length": 53.72916793823242, "epoch": 0.021333333333333333, "grad_norm": 0.8309087697870374, "kl": 0.88134765625, "learning_rate": 8.611127697839647e-07, "loss": 0.0352, "reward": 1.1041667014360428, "reward_std": 0.36124950274825096, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.9583333432674408, "step": 120 }, { "completion_length": 48.89583492279053, "epoch": 0.021511111111111113, "grad_norm": 1.145524145717797, "kl": 1.109375, "learning_rate": 8.586115518032126e-07, "loss": 0.0444, "reward": 1.0000000298023224, "reward_std": 0.3747681975364685, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8958333432674408, "step": 121 }, { "completion_length": 53.83333444595337, "epoch": 0.02168888888888889, "grad_norm": 1.291809265288758, "kl": 1.53271484375, "learning_rate": 8.560917151306592e-07, "loss": 0.0613, "reward": 1.125000037252903, "reward_std": 0.4393179304897785, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.9375000074505806, "step": 122 }, { "completion_length": 56.89583492279053, "epoch": 0.021866666666666666, "grad_norm": 1.7427992065174527, "kl": 1.25, "learning_rate": 8.535533905932737e-07, "loss": 0.05, "reward": 1.1041666865348816, "reward_std": 0.4188222736120224, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.9166666865348816, "step": 123 }, { "completion_length": 57.666667461395264, "epoch": 0.022044444444444443, "grad_norm": 1.0883926763944543, "kl": 0.804931640625, "learning_rate": 8.509967099778933e-07, "loss": 0.0322, "reward": 1.1250000596046448, "reward_std": 0.4283087030053139, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.9583333432674408, "step": 124 }, { "completion_length": 54.97916793823242, "epoch": 0.022222222222222223, "grad_norm": 1.3364813622555163, "kl": 1.42626953125, "learning_rate": 8.484218060243815e-07, "loss": 0.057, "reward": 0.937500037252903, "reward_std": 0.3987618461251259, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8750000223517418, "step": 125 }, { "completion_length": 59.14583492279053, "epoch": 0.0224, "grad_norm": 1.025442216438611, "kl": 1.20263671875, "learning_rate": 8.458288124187358e-07, "loss": 0.0482, "reward": 1.0208333730697632, "reward_std": 0.4563346207141876, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.895833358168602, "step": 126 }, { "completion_length": 50.00000190734863, "epoch": 0.022577777777777776, "grad_norm": 4.84373478962823, "kl": 3.6640625, "learning_rate": 8.432178637861483e-07, "loss": 0.1463, "reward": 0.8333333507180214, "reward_std": 0.4152076132595539, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.7708333432674408, "step": 127 }, { "completion_length": 59.47916841506958, "epoch": 0.022755555555555557, "grad_norm": 2.7058062346789944, "kl": 1.1728515625, "learning_rate": 8.405890956840135e-07, "loss": 0.0469, "reward": 1.1458333507180214, "reward_std": 0.39611808210611343, "rewards/equation_reward_func": 0.22916666977107525, "rewards/format_reward_func": 0.916666679084301, "step": 128 }, { "completion_length": 61.91666841506958, "epoch": 0.022933333333333333, "grad_norm": 2.4631575245463204, "kl": 2.07666015625, "learning_rate": 8.379426445948932e-07, "loss": 0.0831, "reward": 1.145833358168602, "reward_std": 0.5193633921444416, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 0.9166666865348816, "step": 129 }, { "completion_length": 66.70833587646484, "epoch": 0.02311111111111111, "grad_norm": 1.230784921117314, "kl": 1.10009765625, "learning_rate": 8.352786479194287e-07, "loss": 0.044, "reward": 1.062500037252903, "reward_std": 0.32525811344385147, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.9375000074505806, "step": 130 }, { "completion_length": 64.60416889190674, "epoch": 0.02328888888888889, "grad_norm": 1.6700411140187803, "kl": 1.74267578125, "learning_rate": 8.325972439692074e-07, "loss": 0.0696, "reward": 1.0833333656191826, "reward_std": 0.5055268332362175, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.8750000149011612, "step": 131 }, { "completion_length": 66.66666889190674, "epoch": 0.023466666666666667, "grad_norm": 1.3772388429149782, "kl": 0.913818359375, "learning_rate": 8.298985719595823e-07, "loss": 0.0366, "reward": 1.0416667014360428, "reward_std": 0.3589930906891823, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.916666679084301, "step": 132 }, { "completion_length": 69.39583539962769, "epoch": 0.023644444444444444, "grad_norm": 3.244989830519802, "kl": 1.36328125, "learning_rate": 8.271827720024438e-07, "loss": 0.0545, "reward": 1.2083333730697632, "reward_std": 0.4152076169848442, "rewards/equation_reward_func": 0.25000000558793545, "rewards/format_reward_func": 0.9583333432674408, "step": 133 }, { "completion_length": 72.12500286102295, "epoch": 0.023822222222222224, "grad_norm": 2.0362764803071363, "kl": 1.7900390625, "learning_rate": 8.244499850989451e-07, "loss": 0.0715, "reward": 1.0000000521540642, "reward_std": 0.5644823275506496, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.833333358168602, "step": 134 }, { "completion_length": 77.43750381469727, "epoch": 0.024, "grad_norm": 0.9861591115726814, "kl": 0.634521484375, "learning_rate": 8.21700353132182e-07, "loss": 0.0254, "reward": 1.0625000298023224, "reward_std": 0.29669978097081184, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.9583333432674408, "step": 135 }, { "completion_length": 74.06250095367432, "epoch": 0.024177777777777777, "grad_norm": 0.9770205438349707, "kl": 0.664306640625, "learning_rate": 8.189340188598262e-07, "loss": 0.0266, "reward": 0.979166679084301, "reward_std": 0.2591874338686466, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.916666679084301, "step": 136 }, { "completion_length": 79.12500238418579, "epoch": 0.024355555555555554, "grad_norm": 2.110834692267425, "kl": 1.36279296875, "learning_rate": 8.161511259067132e-07, "loss": 0.0545, "reward": 0.8750000298023224, "reward_std": 0.3977733254432678, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.833333358168602, "step": 137 }, { "completion_length": 75.68750286102295, "epoch": 0.024533333333333334, "grad_norm": 1.8894230791150959, "kl": 1.076416015625, "learning_rate": 8.133518187573862e-07, "loss": 0.0431, "reward": 1.0625000298023224, "reward_std": 0.40168892964720726, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.9166666865348816, "step": 138 }, { "completion_length": 77.06250095367432, "epoch": 0.02471111111111111, "grad_norm": 2.8491636564476788, "kl": 1.8115234375, "learning_rate": 8.105362427485942e-07, "loss": 0.0725, "reward": 0.9791666939854622, "reward_std": 0.5409447588026524, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8333333656191826, "step": 139 }, { "completion_length": 80.18750190734863, "epoch": 0.024888888888888887, "grad_norm": 1.0644881637020627, "kl": 1.40087890625, "learning_rate": 8.077045440617464e-07, "loss": 0.0561, "reward": 0.9583333656191826, "reward_std": 0.268673874437809, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.916666679084301, "step": 140 }, { "completion_length": 80.12500190734863, "epoch": 0.025066666666666668, "grad_norm": 2.41329367830394, "kl": 2.546630859375, "learning_rate": 8.048568697153222e-07, "loss": 0.1021, "reward": 1.1250000298023224, "reward_std": 0.37628915905952454, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.9375000149011612, "step": 141 }, { "completion_length": 78.43750286102295, "epoch": 0.025244444444444444, "grad_norm": 2.160199231813677, "kl": 2.1884765625, "learning_rate": 8.019933675572388e-07, "loss": 0.0875, "reward": 1.0000000298023224, "reward_std": 0.2471896894276142, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.9375000149011612, "step": 142 }, { "completion_length": 80.20833587646484, "epoch": 0.02542222222222222, "grad_norm": 2.3799026133423715, "kl": 1.010009765625, "learning_rate": 7.991141862571749e-07, "loss": 0.0404, "reward": 1.0625000223517418, "reward_std": 0.3212082237005234, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.916666679084301, "step": 143 }, { "completion_length": 77.85416984558105, "epoch": 0.0256, "grad_norm": 2.3486898873600084, "kl": 2.1669921875, "learning_rate": 7.962194752988518e-07, "loss": 0.0868, "reward": 0.8958333656191826, "reward_std": 0.4273201934993267, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.833333358168602, "step": 144 }, { "completion_length": 81.31250286102295, "epoch": 0.025777777777777778, "grad_norm": 5.852632022601826, "kl": 0.934326171875, "learning_rate": 7.933093849722723e-07, "loss": 0.0374, "reward": 1.166666716337204, "reward_std": 0.41935470327734947, "rewards/equation_reward_func": 0.2083333395421505, "rewards/format_reward_func": 0.9583333432674408, "step": 145 }, { "completion_length": 79.81250190734863, "epoch": 0.025955555555555555, "grad_norm": 3.627317817877323, "kl": 1.385498046875, "learning_rate": 7.903840663659184e-07, "loss": 0.0555, "reward": 0.9791666865348816, "reward_std": 0.23215004801750183, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.9375000149011612, "step": 146 }, { "completion_length": 79.97916889190674, "epoch": 0.026133333333333335, "grad_norm": 3.3314477031020973, "kl": 0.618408203125, "learning_rate": 7.874436713589063e-07, "loss": 0.0248, "reward": 1.1250000298023224, "reward_std": 0.2957112640142441, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.9791666716337204, "step": 147 }, { "completion_length": 77.14583683013916, "epoch": 0.02631111111111111, "grad_norm": 1.5297879763799356, "kl": 1.740966796875, "learning_rate": 7.844883526131013e-07, "loss": 0.0696, "reward": 1.0208333656191826, "reward_std": 0.4257992319762707, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8958333507180214, "step": 148 }, { "completion_length": 74.06250238418579, "epoch": 0.026488888888888888, "grad_norm": 6.233097760863968, "kl": 3.5986328125, "learning_rate": 7.815182635651912e-07, "loss": 0.1439, "reward": 0.9375000298023224, "reward_std": 0.3627704530954361, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8750000223517418, "step": 149 }, { "completion_length": 73.02083539962769, "epoch": 0.02666666666666667, "grad_norm": 5.958845153705178, "kl": 4.09375, "learning_rate": 7.785335584187219e-07, "loss": 0.1641, "reward": 0.9375000223517418, "reward_std": 0.26070838794112206, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.8958333507180214, "step": 150 }, { "completion_length": 73.68750095367432, "epoch": 0.026844444444444445, "grad_norm": 2.37146983586077, "kl": 1.56298828125, "learning_rate": 7.755343921360886e-07, "loss": 0.0625, "reward": 0.958333358168602, "reward_std": 0.2742270827293396, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8958333432674408, "step": 151 }, { "completion_length": 78.08333587646484, "epoch": 0.027022222222222222, "grad_norm": 0.8124773136751642, "kl": 0.678466796875, "learning_rate": 7.725209204304928e-07, "loss": 0.0271, "reward": 0.937500037252903, "reward_std": 0.31970491260290146, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.8958333507180214, "step": 152 }, { "completion_length": 72.68750238418579, "epoch": 0.0272, "grad_norm": 1.8438522048436141, "kl": 0.5576171875, "learning_rate": 7.694932997578564e-07, "loss": 0.0223, "reward": 1.1250000298023224, "reward_std": 0.3796723149716854, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.9583333432674408, "step": 153 }, { "completion_length": 78.06250238418579, "epoch": 0.02737777777777778, "grad_norm": 2.3953021804030485, "kl": 1.169921875, "learning_rate": 7.664516873086987e-07, "loss": 0.0469, "reward": 1.1666667014360428, "reward_std": 0.44867006316781044, "rewards/equation_reward_func": 0.22916666977107525, "rewards/format_reward_func": 0.9375000149011612, "step": 154 }, { "completion_length": 77.79166984558105, "epoch": 0.027555555555555555, "grad_norm": 0.8215867255038918, "kl": 0.425048828125, "learning_rate": 7.633962409999764e-07, "loss": 0.017, "reward": 1.1041667014360428, "reward_std": 0.3397653251886368, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.9583333432674408, "step": 155 }, { "completion_length": 74.83333492279053, "epoch": 0.027733333333333332, "grad_norm": 0.7182201741238872, "kl": 0.4013671875, "learning_rate": 7.603271194668835e-07, "loss": 0.0161, "reward": 1.2291667014360428, "reward_std": 0.36417657881975174, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 1.0, "step": 156 }, { "completion_length": 77.02083683013916, "epoch": 0.027911111111111112, "grad_norm": 0.8227456410541343, "kl": 0.458984375, "learning_rate": 7.572444820546155e-07, "loss": 0.0184, "reward": 0.958333358168602, "reward_std": 0.20412414148449898, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 0.9375000149011612, "step": 157 }, { "completion_length": 73.06250190734863, "epoch": 0.02808888888888889, "grad_norm": 1.4714763458802167, "kl": 0.7119140625, "learning_rate": 7.541484888100973e-07, "loss": 0.0285, "reward": 0.9791666865348816, "reward_std": 0.4867551550269127, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.833333358168602, "step": 158 }, { "completion_length": 72.87500190734863, "epoch": 0.028266666666666666, "grad_norm": 1.4514512402843838, "kl": 0.871337890625, "learning_rate": 7.510393004736722e-07, "loss": 0.0349, "reward": 0.9791666939854622, "reward_std": 0.5999412871897221, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.812500037252903, "step": 159 }, { "completion_length": 67.31250143051147, "epoch": 0.028444444444444446, "grad_norm": 2.116607202051419, "kl": 1.553466796875, "learning_rate": 7.479170784707574e-07, "loss": 0.0621, "reward": 0.8333333544433117, "reward_std": 0.4538251422345638, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.7708333544433117, "step": 160 }, { "completion_length": 64.12500143051147, "epoch": 0.028622222222222223, "grad_norm": 4.988492970714175, "kl": 2.7998046875, "learning_rate": 7.447819849034628e-07, "loss": 0.1121, "reward": 0.666666692122817, "reward_std": 0.49578551203012466, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 0.6458333563059568, "step": 161 }, { "completion_length": 61.10416841506958, "epoch": 0.0288, "grad_norm": 6.132561250847984, "kl": 3.8466796875, "learning_rate": 7.416341825421753e-07, "loss": 0.1538, "reward": 0.5625000111758709, "reward_std": 0.5616070628166199, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 0.5416666753590107, "step": 162 }, { "completion_length": 62.312501430511475, "epoch": 0.02897777777777778, "grad_norm": 4.666254586473765, "kl": 3.7216796875, "learning_rate": 7.384738348171068e-07, "loss": 0.1486, "reward": 0.8541666939854622, "reward_std": 0.5078980773687363, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.7291666828095913, "step": 163 }, { "completion_length": 70.00000286102295, "epoch": 0.029155555555555556, "grad_norm": 1.4445543590308407, "kl": 1.42333984375, "learning_rate": 7.353011058098103e-07, "loss": 0.057, "reward": 0.937500037252903, "reward_std": 0.5709268674254417, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.7916666939854622, "step": 164 }, { "completion_length": 72.79167079925537, "epoch": 0.029333333333333333, "grad_norm": 1.8455254519389834, "kl": 1.66259765625, "learning_rate": 7.321161602446601e-07, "loss": 0.0666, "reward": 1.0416667014360428, "reward_std": 0.40530357509851456, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.9166666865348816, "step": 165 }, { "completion_length": 67.22916793823242, "epoch": 0.02951111111111111, "grad_norm": 1.2830647533382114, "kl": 0.5908203125, "learning_rate": 7.289191634803002e-07, "loss": 0.0236, "reward": 1.0000000298023224, "reward_std": 0.2831810861825943, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.9375000149011612, "step": 166 }, { "completion_length": 67.145836353302, "epoch": 0.02968888888888889, "grad_norm": 1.5595729319752896, "kl": 0.570068359375, "learning_rate": 7.257102815010584e-07, "loss": 0.0228, "reward": 1.1458333656191826, "reward_std": 0.4072421304881573, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.9375000074505806, "step": 167 }, { "completion_length": 65.50000190734863, "epoch": 0.029866666666666666, "grad_norm": 0.8324762943011298, "kl": 1.06201171875, "learning_rate": 7.224896809083297e-07, "loss": 0.0424, "reward": 1.1666667014360428, "reward_std": 0.37223926186561584, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.9583333432674408, "step": 168 }, { "completion_length": 62.729167461395264, "epoch": 0.030044444444444443, "grad_norm": 2.4594683537617437, "kl": 0.987060546875, "learning_rate": 7.192575289119245e-07, "loss": 0.0395, "reward": 1.0000000298023224, "reward_std": 0.39079635962843895, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8958333507180214, "step": 169 }, { "completion_length": 58.02083444595337, "epoch": 0.030222222222222223, "grad_norm": 1.7347800603929018, "kl": 2.14453125, "learning_rate": 7.160139933213898e-07, "loss": 0.0858, "reward": 1.1666667014360428, "reward_std": 0.3841203413903713, "rewards/equation_reward_func": 0.2083333358168602, "rewards/format_reward_func": 0.9583333432674408, "step": 170 }, { "completion_length": 55.604167461395264, "epoch": 0.0304, "grad_norm": 2.866447003547887, "kl": 3.4658203125, "learning_rate": 7.12759242537295e-07, "loss": 0.1386, "reward": 1.0000000223517418, "reward_std": 0.41085678339004517, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8750000223517418, "step": 171 }, { "completion_length": 53.104167461395264, "epoch": 0.030577777777777777, "grad_norm": 2.387493786129646, "kl": 4.9072265625, "learning_rate": 7.094934455424888e-07, "loss": 0.1961, "reward": 1.0416667088866234, "reward_std": 0.49133747816085815, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.8750000223517418, "step": 172 }, { "completion_length": 55.875001430511475, "epoch": 0.030755555555555557, "grad_norm": 2.9662179988283444, "kl": 2.531494140625, "learning_rate": 7.06216771893327e-07, "loss": 0.1012, "reward": 0.9375000223517418, "reward_std": 0.28219256922602654, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.8958333507180214, "step": 173 }, { "completion_length": 51.500001430511475, "epoch": 0.030933333333333334, "grad_norm": 1.3148679005802646, "kl": 2.0830078125, "learning_rate": 7.029293917108677e-07, "loss": 0.0833, "reward": 0.9166667014360428, "reward_std": 0.44083888083696365, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.8333333507180214, "step": 174 }, { "completion_length": 55.500000953674316, "epoch": 0.03111111111111111, "grad_norm": 1.940206757858531, "kl": 0.938232421875, "learning_rate": 6.996314756720408e-07, "loss": 0.0375, "reward": 1.1250000298023224, "reward_std": 0.3506578877568245, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.9583333432674408, "step": 175 }, { "completion_length": 57.583335399627686, "epoch": 0.03128888888888889, "grad_norm": 0.4970228156471284, "kl": 0.618408203125, "learning_rate": 6.963231950007844e-07, "loss": 0.0247, "reward": 1.208333358168602, "reward_std": 0.30354244261980057, "rewards/equation_reward_func": 0.2083333358168602, "rewards/format_reward_func": 1.0, "step": 176 }, { "completion_length": 57.22916793823242, "epoch": 0.031466666666666664, "grad_norm": 2.1203602590832387, "kl": 0.69189453125, "learning_rate": 6.930047214591568e-07, "loss": 0.0277, "reward": 1.2500000447034836, "reward_std": 0.47683026641607285, "rewards/equation_reward_func": 0.3125000074505806, "rewards/format_reward_func": 0.9375000149011612, "step": 177 }, { "completion_length": 54.47916793823242, "epoch": 0.03164444444444445, "grad_norm": 1.2499222352319177, "kl": 1.194091796875, "learning_rate": 6.896762273384178e-07, "loss": 0.0477, "reward": 1.0416667014360428, "reward_std": 0.46985330432653427, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.895833358168602, "step": 178 }, { "completion_length": 48.437500953674316, "epoch": 0.031822222222222224, "grad_norm": 1.861799414305649, "kl": 1.77197265625, "learning_rate": 6.863378854500845e-07, "loss": 0.0708, "reward": 0.937500037252903, "reward_std": 0.3627704605460167, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8750000149011612, "step": 179 }, { "completion_length": 52.33333492279053, "epoch": 0.032, "grad_norm": 1.4601874721448491, "kl": 1.70166015625, "learning_rate": 6.829898691169579e-07, "loss": 0.068, "reward": 0.9375000298023224, "reward_std": 0.27258946001529694, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.8958333432674408, "step": 180 }, { "completion_length": 52.47916793823242, "epoch": 0.03217777777777778, "grad_norm": 3.4537886700647316, "kl": 1.458251953125, "learning_rate": 6.796323521641256e-07, "loss": 0.0584, "reward": 1.0416667237877846, "reward_std": 0.5502192042768002, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.854166679084301, "step": 181 }, { "completion_length": 52.812501430511475, "epoch": 0.032355555555555554, "grad_norm": 1.1876803036822332, "kl": 1.65576171875, "learning_rate": 6.762655089099353e-07, "loss": 0.0663, "reward": 1.1041667014360428, "reward_std": 0.4688647910952568, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.9166666865348816, "step": 182 }, { "completion_length": 50.812501430511475, "epoch": 0.03253333333333333, "grad_norm": 3.60684735229285, "kl": 2.094482421875, "learning_rate": 6.728895141569462e-07, "loss": 0.0838, "reward": 1.020833358168602, "reward_std": 0.4737688973546028, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.8750000298023224, "step": 183 }, { "completion_length": 50.58333444595337, "epoch": 0.032711111111111114, "grad_norm": 5.166407984979809, "kl": 6.22412109375, "learning_rate": 6.695045431828524e-07, "loss": 0.2489, "reward": 0.9166667014360428, "reward_std": 0.44867006316781044, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8125000074505806, "step": 184 }, { "completion_length": 52.20833396911621, "epoch": 0.03288888888888889, "grad_norm": 5.5000432690940375, "kl": 6.056640625, "learning_rate": 6.661107717313823e-07, "loss": 0.2423, "reward": 1.020833358168602, "reward_std": 0.41912320628762245, "rewards/equation_reward_func": 0.14583333395421505, "rewards/format_reward_func": 0.8750000149011612, "step": 185 }, { "completion_length": 48.854167461395264, "epoch": 0.03306666666666667, "grad_norm": 7.26207758284529, "kl": 9.0859375, "learning_rate": 6.627083760031754e-07, "loss": 0.3635, "reward": 0.9375000298023224, "reward_std": 0.6717033982276917, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.7500000298023224, "step": 186 }, { "completion_length": 49.250000953674316, "epoch": 0.033244444444444445, "grad_norm": 6.343563025728324, "kl": 7.396484375, "learning_rate": 6.592975326466336e-07, "loss": 0.2961, "reward": 0.895833358168602, "reward_std": 0.393995963037014, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.8125000149011612, "step": 187 }, { "completion_length": 49.625000953674316, "epoch": 0.03342222222222222, "grad_norm": 2.659807025481146, "kl": 2.344482421875, "learning_rate": 6.558784187487494e-07, "loss": 0.0939, "reward": 1.2083333730697632, "reward_std": 0.5645233578979969, "rewards/equation_reward_func": 0.31250000558793545, "rewards/format_reward_func": 0.8958333507180214, "step": 188 }, { "completion_length": 52.562500953674316, "epoch": 0.0336, "grad_norm": 1.2466332143572052, "kl": 2.30419921875, "learning_rate": 6.524512118259121e-07, "loss": 0.0922, "reward": 1.0625000298023224, "reward_std": 0.29669977352023125, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.9375000149011612, "step": 189 }, { "completion_length": 51.291667461395264, "epoch": 0.033777777777777775, "grad_norm": 0.8974480191913635, "kl": 1.81787109375, "learning_rate": 6.490160898146918e-07, "loss": 0.0727, "reward": 1.1458333879709244, "reward_std": 0.3842546343803406, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.9583333432674408, "step": 190 }, { "completion_length": 51.75000190734863, "epoch": 0.03395555555555556, "grad_norm": 2.4757198804027007, "kl": 1.689208984375, "learning_rate": 6.455732310626004e-07, "loss": 0.0675, "reward": 1.0416667014360428, "reward_std": 0.47692746296525, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.8750000223517418, "step": 191 }, { "completion_length": 51.187500953674316, "epoch": 0.034133333333333335, "grad_norm": 1.3507594543460832, "kl": 1.50927734375, "learning_rate": 6.421228143188324e-07, "loss": 0.0604, "reward": 1.1458333730697632, "reward_std": 0.49200813844799995, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 0.916666679084301, "step": 192 }, { "completion_length": 51.687501430511475, "epoch": 0.03431111111111111, "grad_norm": 0.6767908253371119, "kl": 1.2314453125, "learning_rate": 6.386650187249843e-07, "loss": 0.0493, "reward": 1.0000000223517418, "reward_std": 0.23116153106093407, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.9375000074505806, "step": 193 }, { "completion_length": 50.14583444595337, "epoch": 0.03448888888888889, "grad_norm": 1.0489566647059154, "kl": 1.8955078125, "learning_rate": 6.352000238057539e-07, "loss": 0.0759, "reward": 0.9791666939854622, "reward_std": 0.5161184519529343, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.833333358168602, "step": 194 }, { "completion_length": 51.62500190734863, "epoch": 0.034666666666666665, "grad_norm": 1.6832436149314471, "kl": 2.7314453125, "learning_rate": 6.317280094596196e-07, "loss": 0.1092, "reward": 0.8541666939854622, "reward_std": 0.4447544738650322, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.7916666865348816, "step": 195 }, { "completion_length": 45.08333396911621, "epoch": 0.03484444444444444, "grad_norm": 4.004059770646148, "kl": 5.3125, "learning_rate": 6.282491559495004e-07, "loss": 0.2125, "reward": 0.9375000111758709, "reward_std": 0.45106470584869385, "rewards/equation_reward_func": 0.14583333395421505, "rewards/format_reward_func": 0.7916666828095913, "step": 196 }, { "completion_length": 43.437500953674316, "epoch": 0.035022222222222225, "grad_norm": 5.168360687547481, "kl": 6.869140625, "learning_rate": 6.247636438933962e-07, "loss": 0.2745, "reward": 0.791666679084301, "reward_std": 0.5768304541707039, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.7083333618938923, "step": 197 }, { "completion_length": 51.89583444595337, "epoch": 0.0352, "grad_norm": 1.3469737406143796, "kl": 4.0185546875, "learning_rate": 6.212716542550112e-07, "loss": 0.1607, "reward": 0.937500037252903, "reward_std": 0.49327604100108147, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.791666679084301, "step": 198 }, { "completion_length": 50.45833492279053, "epoch": 0.03537777777777778, "grad_norm": 1.0759183343173833, "kl": 1.4365234375, "learning_rate": 6.177733683343578e-07, "loss": 0.0576, "reward": 1.166666716337204, "reward_std": 0.5774685852229595, "rewards/equation_reward_func": 0.2500000074505806, "rewards/format_reward_func": 0.9166666865348816, "step": 199 }, { "completion_length": 52.16666793823242, "epoch": 0.035555555555555556, "grad_norm": 1.3378011133376908, "kl": 1.564453125, "learning_rate": 6.142689677583445e-07, "loss": 0.0626, "reward": 1.1250000298023224, "reward_std": 0.48728758841753006, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.9166666865348816, "step": 200 }, { "completion_length": 53.35416793823242, "epoch": 0.03573333333333333, "grad_norm": 3.0978024708495218, "kl": 1.87646484375, "learning_rate": 6.107586344713451e-07, "loss": 0.075, "reward": 1.1041666865348816, "reward_std": 0.4963582567870617, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.9166666865348816, "step": 201 }, { "completion_length": 54.60416793823242, "epoch": 0.03591111111111111, "grad_norm": 1.4803852973189526, "kl": 1.585693359375, "learning_rate": 6.072425507257527e-07, "loss": 0.0633, "reward": 1.041666679084301, "reward_std": 0.2958494834601879, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.9166666716337204, "step": 202 }, { "completion_length": 50.97916793823242, "epoch": 0.036088888888888886, "grad_norm": 1.0545514121261095, "kl": 3.23779296875, "learning_rate": 6.03720899072518e-07, "loss": 0.1296, "reward": 0.9791667014360428, "reward_std": 0.3412862755358219, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.8958333507180214, "step": 203 }, { "completion_length": 53.29166793823242, "epoch": 0.03626666666666667, "grad_norm": 1.258708711483678, "kl": 4.0927734375, "learning_rate": 6.001938623516705e-07, "loss": 0.1635, "reward": 0.895833358168602, "reward_std": 0.5528258420526981, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.7916666939854622, "step": 204 }, { "completion_length": 52.562501430511475, "epoch": 0.036444444444444446, "grad_norm": 1.192627874291934, "kl": 4.099365234375, "learning_rate": 5.966616236828262e-07, "loss": 0.1639, "reward": 0.8750000204890966, "reward_std": 0.5111859105527401, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.7291666772216558, "step": 205 }, { "completion_length": 51.97916841506958, "epoch": 0.03662222222222222, "grad_norm": 0.9273331385562189, "kl": 2.384033203125, "learning_rate": 5.931243664556802e-07, "loss": 0.0952, "reward": 1.0208333730697632, "reward_std": 0.40168892592191696, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8958333432674408, "step": 206 }, { "completion_length": 52.937500953674316, "epoch": 0.0368, "grad_norm": 1.8010297199927365, "kl": 3.5654296875, "learning_rate": 5.895822743204855e-07, "loss": 0.1426, "reward": 0.9583333730697632, "reward_std": 0.5215550065040588, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8125000223517418, "step": 207 }, { "completion_length": 51.60416841506958, "epoch": 0.036977777777777776, "grad_norm": 2.2135535055122597, "kl": 4.67578125, "learning_rate": 5.860355311785175e-07, "loss": 0.1869, "reward": 0.7916667014360428, "reward_std": 0.5062714368104935, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.7291666828095913, "step": 208 }, { "completion_length": 51.187501430511475, "epoch": 0.03715555555555555, "grad_norm": 1.4267067115373708, "kl": 5.53515625, "learning_rate": 5.824843211725264e-07, "loss": 0.221, "reward": 0.7916666939854622, "reward_std": 0.5978475920855999, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.6875000149011612, "step": 209 }, { "completion_length": 46.83333492279053, "epoch": 0.037333333333333336, "grad_norm": 2.8629336811508863, "kl": 5.829345703125, "learning_rate": 5.78928828677177e-07, "loss": 0.2335, "reward": 0.7916667014360428, "reward_std": 0.6519223563373089, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.6458333507180214, "step": 210 }, { "completion_length": 56.500001430511475, "epoch": 0.03751111111111111, "grad_norm": 4.250283297994686, "kl": 3.20849609375, "learning_rate": 5.753692382894759e-07, "loss": 0.1283, "reward": 0.8958333544433117, "reward_std": 0.6043893173336983, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.7083333469927311, "step": 211 }, { "completion_length": 54.375001430511475, "epoch": 0.03768888888888889, "grad_norm": 1.3837454584410787, "kl": 2.209228515625, "learning_rate": 5.718057348191874e-07, "loss": 0.0884, "reward": 0.8541666902601719, "reward_std": 0.6237337328493595, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.7083333544433117, "step": 212 }, { "completion_length": 51.89583444595337, "epoch": 0.037866666666666667, "grad_norm": 2.4493534707311713, "kl": 2.712158203125, "learning_rate": 5.682385032792385e-07, "loss": 0.1085, "reward": 0.7916666753590107, "reward_std": 0.5281161963939667, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.7083333544433117, "step": 213 }, { "completion_length": 55.89583492279053, "epoch": 0.03804444444444444, "grad_norm": 2.2208225510289012, "kl": 2.525634765625, "learning_rate": 5.646677288761132e-07, "loss": 0.101, "reward": 0.9791667014360428, "reward_std": 0.5173863507807255, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.8125000149011612, "step": 214 }, { "completion_length": 57.33333444595337, "epoch": 0.03822222222222222, "grad_norm": 0.8710411293209732, "kl": 2.434814453125, "learning_rate": 5.610935970002365e-07, "loss": 0.0974, "reward": 0.9375000223517418, "reward_std": 0.5438718348741531, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.7708333507180214, "step": 215 }, { "completion_length": 52.89583492279053, "epoch": 0.0384, "grad_norm": 3.567048205512723, "kl": 5.6318359375, "learning_rate": 5.575162932163501e-07, "loss": 0.2252, "reward": 0.7916666865348816, "reward_std": 0.6442962922155857, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.6666666865348816, "step": 216 }, { "completion_length": 47.85416793823242, "epoch": 0.03857777777777778, "grad_norm": 5.381576408790184, "kl": 7.646484375, "learning_rate": 5.53936003253877e-07, "loss": 0.3065, "reward": 0.6041666883975267, "reward_std": 0.5564196482300758, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.5625000167638063, "step": 217 }, { "completion_length": 57.354167461395264, "epoch": 0.03875555555555556, "grad_norm": 2.8527721738006755, "kl": 5.25146484375, "learning_rate": 5.503529129972792e-07, "loss": 0.2104, "reward": 0.8333333618938923, "reward_std": 0.5661325417459011, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.7083333469927311, "step": 218 }, { "completion_length": 52.77083444595337, "epoch": 0.038933333333333334, "grad_norm": 0.9510637073032893, "kl": 4.537109375, "learning_rate": 5.467672084764065e-07, "loss": 0.1813, "reward": 0.7083333469927311, "reward_std": 0.5277270041406155, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.6458333469927311, "step": 219 }, { "completion_length": 56.437501430511475, "epoch": 0.03911111111111111, "grad_norm": 1.4626021636984767, "kl": 3.047607421875, "learning_rate": 5.431790758568388e-07, "loss": 0.122, "reward": 0.7916666902601719, "reward_std": 0.5062714405357838, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.7083333618938923, "step": 220 }, { "completion_length": 52.29166793823242, "epoch": 0.03928888888888889, "grad_norm": 1.2504772253255731, "kl": 3.7666015625, "learning_rate": 5.395887014302191e-07, "loss": 0.1507, "reward": 0.8125000186264515, "reward_std": 0.4043150581419468, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.7708333544433117, "step": 221 }, { "completion_length": 58.39583492279053, "epoch": 0.039466666666666664, "grad_norm": 1.7464009649268544, "kl": 3.71337890625, "learning_rate": 5.359962716045835e-07, "loss": 0.1485, "reward": 0.7916666772216558, "reward_std": 0.5010788105428219, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.7083333563059568, "step": 222 }, { "completion_length": 61.41666841506958, "epoch": 0.03964444444444445, "grad_norm": 3.039553978693204, "kl": 1.073974609375, "learning_rate": 5.324019728946812e-07, "loss": 0.043, "reward": 0.854166679084301, "reward_std": 0.36124951019883156, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.8125000149011612, "step": 223 }, { "completion_length": 56.50000238418579, "epoch": 0.039822222222222224, "grad_norm": 2.0365403750402704, "kl": 1.76123046875, "learning_rate": 5.288059919122921e-07, "loss": 0.0705, "reward": 0.9583333656191826, "reward_std": 0.45534609258174896, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8541666865348816, "step": 224 }, { "completion_length": 61.14583492279053, "epoch": 0.04, "grad_norm": 2.811287393829535, "kl": 0.533447265625, "learning_rate": 5.252085153565374e-07, "loss": 0.0213, "reward": 1.1875000298023224, "reward_std": 0.37868379428982735, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 0.9583333432674408, "step": 225 }, { "completion_length": 59.437501430511475, "epoch": 0.04017777777777778, "grad_norm": 2.3086911193675315, "kl": 0.8291015625, "learning_rate": 5.216097300041869e-07, "loss": 0.0332, "reward": 1.0416667014360428, "reward_std": 0.3477308079600334, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.9166666865348816, "step": 226 }, { "completion_length": 62.20833492279053, "epoch": 0.040355555555555554, "grad_norm": 4.276021428219898, "kl": 0.727783203125, "learning_rate": 5.180098226999618e-07, "loss": 0.0291, "reward": 1.062500037252903, "reward_std": 0.4822668209671974, "rewards/equation_reward_func": 0.18750000186264515, "rewards/format_reward_func": 0.8750000149011612, "step": 227 }, { "completion_length": 65.10416889190674, "epoch": 0.04053333333333333, "grad_norm": 1.2764419644302698, "kl": 0.894775390625, "learning_rate": 5.144089803468332e-07, "loss": 0.0358, "reward": 1.1250000447034836, "reward_std": 0.4984116405248642, "rewards/equation_reward_func": 0.2083333395421505, "rewards/format_reward_func": 0.9166666865348816, "step": 228 }, { "completion_length": 55.812501430511475, "epoch": 0.040711111111111115, "grad_norm": 1.717756013089439, "kl": 1.95361328125, "learning_rate": 5.108073898963193e-07, "loss": 0.0781, "reward": 0.9583333656191826, "reward_std": 0.3477308079600334, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8958333507180214, "step": 229 }, { "completion_length": 57.02083444595337, "epoch": 0.04088888888888889, "grad_norm": 1.9105584928374175, "kl": 3.0869140625, "learning_rate": 5.072052383387786e-07, "loss": 0.1236, "reward": 0.9375000260770321, "reward_std": 0.37575671821832657, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8333333469927311, "step": 230 }, { "completion_length": 53.437501430511475, "epoch": 0.04106666666666667, "grad_norm": 3.2606237368453375, "kl": 6.6767578125, "learning_rate": 5.036027126937013e-07, "loss": 0.2667, "reward": 0.7708333656191826, "reward_std": 0.5834501683712006, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.6458333469927311, "step": 231 }, { "completion_length": 57.43750190734863, "epoch": 0.041244444444444445, "grad_norm": 4.160342344463272, "kl": 5.7734375, "learning_rate": 5e-07, "loss": 0.2307, "reward": 0.7916666939854622, "reward_std": 0.550732146948576, "rewards/equation_reward_func": 0.08333333395421505, "rewards/format_reward_func": 0.708333358168602, "step": 232 }, { "completion_length": 50.937501430511475, "epoch": 0.04142222222222222, "grad_norm": 1.7460050228805684, "kl": 5.1337890625, "learning_rate": 4.963972873062987e-07, "loss": 0.2054, "reward": 0.8958333618938923, "reward_std": 0.609845332801342, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.7083333507180214, "step": 233 }, { "completion_length": 59.77083396911621, "epoch": 0.0416, "grad_norm": 0.9068735838301717, "kl": 2.849609375, "learning_rate": 4.927947616612215e-07, "loss": 0.1138, "reward": 0.958333358168602, "reward_std": 0.36931218579411507, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.8750000149011612, "step": 234 }, { "completion_length": 57.58333492279053, "epoch": 0.041777777777777775, "grad_norm": 2.6421618291419535, "kl": 2.1416015625, "learning_rate": 4.891926101036806e-07, "loss": 0.0856, "reward": 0.875000037252903, "reward_std": 0.45271996036171913, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8125000223517418, "step": 235 }, { "completion_length": 61.14583492279053, "epoch": 0.04195555555555556, "grad_norm": 2.420138039635954, "kl": 1.884765625, "learning_rate": 4.855910196531669e-07, "loss": 0.0753, "reward": 0.8541667014360428, "reward_std": 0.5093041993677616, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.770833358168602, "step": 236 }, { "completion_length": 57.625000953674316, "epoch": 0.042133333333333335, "grad_norm": 2.23529671294081, "kl": 1.854248046875, "learning_rate": 4.819901773000383e-07, "loss": 0.074, "reward": 0.9791667088866234, "reward_std": 0.5189073011279106, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8333333507180214, "step": 237 }, { "completion_length": 52.47916793823242, "epoch": 0.04231111111111111, "grad_norm": 1.6361655131027242, "kl": 5.17529296875, "learning_rate": 4.783902699958129e-07, "loss": 0.2068, "reward": 0.8125000298023224, "reward_std": 0.5698040388524532, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.7083333507180214, "step": 238 }, { "completion_length": 49.375001430511475, "epoch": 0.04248888888888889, "grad_norm": 1.222312539805785, "kl": 5.5263671875, "learning_rate": 4.747914846434627e-07, "loss": 0.2211, "reward": 0.9166667088866234, "reward_std": 0.6387733817100525, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.7500000298023224, "step": 239 }, { "completion_length": 51.750001430511475, "epoch": 0.042666666666666665, "grad_norm": 1.9824590980545862, "kl": 4.46875, "learning_rate": 4.711940080877079e-07, "loss": 0.1786, "reward": 1.0208333767950535, "reward_std": 0.5363415889441967, "rewards/equation_reward_func": 0.22916667349636555, "rewards/format_reward_func": 0.7916666753590107, "step": 240 }, { "completion_length": 54.687500953674316, "epoch": 0.04284444444444444, "grad_norm": 1.6661289610091754, "kl": 4.4775390625, "learning_rate": 4.675980271053187e-07, "loss": 0.1791, "reward": 0.8750000298023224, "reward_std": 0.4297148250043392, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8125000149011612, "step": 241 }, { "completion_length": 50.437501430511475, "epoch": 0.043022222222222226, "grad_norm": 3.5074096399017862, "kl": 6.017578125, "learning_rate": 4.6400372839541647e-07, "loss": 0.2406, "reward": 0.8541666828095913, "reward_std": 0.5376026295125484, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.7291666828095913, "step": 242 }, { "completion_length": 52.10416793823242, "epoch": 0.0432, "grad_norm": 3.137739294752723, "kl": 4.3701171875, "learning_rate": 4.6041129856978083e-07, "loss": 0.1746, "reward": 0.8958333656191826, "reward_std": 0.47406983748078346, "rewards/equation_reward_func": 0.1041666679084301, "rewards/format_reward_func": 0.791666679084301, "step": 243 }, { "completion_length": 56.16666793823242, "epoch": 0.04337777777777778, "grad_norm": 1.638039054641607, "kl": 2.74462890625, "learning_rate": 4.568209241431614e-07, "loss": 0.11, "reward": 0.9375000447034836, "reward_std": 0.3842546418309212, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8750000149011612, "step": 244 }, { "completion_length": 59.791667461395264, "epoch": 0.043555555555555556, "grad_norm": 1.4835502515862988, "kl": 2.812744140625, "learning_rate": 4.532327915235935e-07, "loss": 0.1124, "reward": 1.0416666939854622, "reward_std": 0.49689069390296936, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.8750000223517418, "step": 245 }, { "completion_length": 58.87500238418579, "epoch": 0.04373333333333333, "grad_norm": 1.2125553729452667, "kl": 1.87158203125, "learning_rate": 4.4964708700272086e-07, "loss": 0.0746, "reward": 1.0000000298023224, "reward_std": 0.3747682049870491, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8958333432674408, "step": 246 }, { "completion_length": 59.83333444595337, "epoch": 0.04391111111111111, "grad_norm": 0.8400742214946505, "kl": 1.407470703125, "learning_rate": 4.4606399674612306e-07, "loss": 0.0563, "reward": 1.1041666865348816, "reward_std": 0.2591874338686466, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.9583333432674408, "step": 247 }, { "completion_length": 56.31250238418579, "epoch": 0.044088888888888886, "grad_norm": 1.5894946494651951, "kl": 2.568603515625, "learning_rate": 4.424837067836499e-07, "loss": 0.1027, "reward": 1.1458333507180214, "reward_std": 0.41730131581425667, "rewards/equation_reward_func": 0.2500000037252903, "rewards/format_reward_func": 0.8958333507180214, "step": 248 }, { "completion_length": 57.97916793823242, "epoch": 0.04426666666666667, "grad_norm": 1.5682356442872543, "kl": 0.796142578125, "learning_rate": 4.389064029997634e-07, "loss": 0.0319, "reward": 1.208333358168602, "reward_std": 0.4256649389863014, "rewards/equation_reward_func": 0.2500000037252903, "rewards/format_reward_func": 0.9583333432674408, "step": 249 }, { "completion_length": 53.187501430511475, "epoch": 0.044444444444444446, "grad_norm": 1.616534745852269, "kl": 2.35107421875, "learning_rate": 4.353322711238869e-07, "loss": 0.094, "reward": 1.0833333805203438, "reward_std": 0.5344030410051346, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.8958333507180214, "step": 250 }, { "completion_length": 56.437501430511475, "epoch": 0.04462222222222222, "grad_norm": 0.9960822449051799, "kl": 2.34619140625, "learning_rate": 4.3176149672076143e-07, "loss": 0.0939, "reward": 1.0000000149011612, "reward_std": 0.3006153665482998, "rewards/equation_reward_func": 0.08333333395421505, "rewards/format_reward_func": 0.9166666865348816, "step": 251 }, { "completion_length": 58.354167461395264, "epoch": 0.0448, "grad_norm": 1.4205394021526243, "kl": 2.93505859375, "learning_rate": 4.2819426518081256e-07, "loss": 0.1178, "reward": 1.1458333656191826, "reward_std": 0.5342973358929157, "rewards/equation_reward_func": 0.2708333395421505, "rewards/format_reward_func": 0.8750000223517418, "step": 252 }, { "completion_length": 52.416667461395264, "epoch": 0.044977777777777776, "grad_norm": 4.465478199693993, "kl": 4.697509765625, "learning_rate": 4.246307617105241e-07, "loss": 0.1876, "reward": 1.0625000223517418, "reward_std": 0.5653560161590576, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.8541666865348816, "step": 253 }, { "completion_length": 53.77083396911621, "epoch": 0.04515555555555555, "grad_norm": 2.9278233844283457, "kl": 5.1484375, "learning_rate": 4.21071171322823e-07, "loss": 0.206, "reward": 0.9583333693444729, "reward_std": 0.5784117728471756, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.7708333544433117, "step": 254 }, { "completion_length": 46.81250071525574, "epoch": 0.04533333333333334, "grad_norm": 3.0060965561056654, "kl": 6.7744140625, "learning_rate": 4.1751567882747373e-07, "loss": 0.2713, "reward": 1.0000000298023224, "reward_std": 0.6698537915945053, "rewards/equation_reward_func": 0.25000000186264515, "rewards/format_reward_func": 0.7500000223517418, "step": 255 }, { "completion_length": 52.812501430511475, "epoch": 0.04551111111111111, "grad_norm": 2.435192469284996, "kl": 3.97900390625, "learning_rate": 4.139644688214826e-07, "loss": 0.1592, "reward": 0.9583333730697632, "reward_std": 0.44083887711167336, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8541666865348816, "step": 256 }, { "completion_length": 54.500000953674316, "epoch": 0.04568888888888889, "grad_norm": 2.3477327869102136, "kl": 4.3671875, "learning_rate": 4.104177256795144e-07, "loss": 0.1747, "reward": 1.1250000298023224, "reward_std": 0.5271082073450089, "rewards/equation_reward_func": 0.25000000558793545, "rewards/format_reward_func": 0.8750000149011612, "step": 257 }, { "completion_length": 55.562501430511475, "epoch": 0.04586666666666667, "grad_norm": 1.462739785577378, "kl": 1.19873046875, "learning_rate": 4.068756335443198e-07, "loss": 0.0481, "reward": 1.083333358168602, "reward_std": 0.42273785918951035, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.9166666865348816, "step": 258 }, { "completion_length": 54.10416841506958, "epoch": 0.04604444444444444, "grad_norm": 1.0194230408797653, "kl": 1.746826171875, "learning_rate": 4.0333837631717376e-07, "loss": 0.07, "reward": 1.1250000447034836, "reward_std": 0.4783512204885483, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.9166666865348816, "step": 259 }, { "completion_length": 54.27083444595337, "epoch": 0.04622222222222222, "grad_norm": 1.2154097528831294, "kl": 2.1083984375, "learning_rate": 3.998061376483297e-07, "loss": 0.0844, "reward": 1.0833333730697632, "reward_std": 0.44083888456225395, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.8958333507180214, "step": 260 }, { "completion_length": 52.75000190734863, "epoch": 0.0464, "grad_norm": 0.8928800361092635, "kl": 2.84619140625, "learning_rate": 3.9627910092748204e-07, "loss": 0.1137, "reward": 0.8750000074505806, "reward_std": 0.3061685785651207, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.8333333432674408, "step": 261 }, { "completion_length": 48.687500953674316, "epoch": 0.04657777777777778, "grad_norm": 2.0850690944306476, "kl": 4.7197265625, "learning_rate": 3.9275744927424723e-07, "loss": 0.1885, "reward": 0.958333358168602, "reward_std": 0.5193834900856018, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.7708333544433117, "step": 262 }, { "completion_length": 48.187501430511475, "epoch": 0.04675555555555556, "grad_norm": 1.408092281724432, "kl": 4.294189453125, "learning_rate": 3.89241365528655e-07, "loss": 0.1719, "reward": 0.8333333507180214, "reward_std": 0.5240839421749115, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.708333358168602, "step": 263 }, { "completion_length": 51.64583396911621, "epoch": 0.046933333333333334, "grad_norm": 1.9193528605234629, "kl": 3.56005859375, "learning_rate": 3.8573103224165547e-07, "loss": 0.1424, "reward": 0.8333333656191826, "reward_std": 0.6091980896890163, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.7083333507180214, "step": 264 }, { "completion_length": 51.37500047683716, "epoch": 0.04711111111111111, "grad_norm": 1.8731610861564136, "kl": 3.638671875, "learning_rate": 3.8222663166564207e-07, "loss": 0.1455, "reward": 0.8125000298023224, "reward_std": 0.6027945913374424, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.6875000149011612, "step": 265 }, { "completion_length": 58.45833492279053, "epoch": 0.04728888888888889, "grad_norm": 1.1952492746934869, "kl": 1.924560546875, "learning_rate": 3.787283457449889e-07, "loss": 0.0769, "reward": 1.1666666939854622, "reward_std": 0.48326630517840385, "rewards/equation_reward_func": 0.25000000558793545, "rewards/format_reward_func": 0.916666679084301, "step": 266 }, { "completion_length": 45.64583444595337, "epoch": 0.047466666666666664, "grad_norm": 3.8058624476316156, "kl": 7.12890625, "learning_rate": 3.752363561066039e-07, "loss": 0.285, "reward": 0.729166692122817, "reward_std": 0.5802789963781834, "rewards/equation_reward_func": 0.1041666679084301, "rewards/format_reward_func": 0.6250000204890966, "step": 267 }, { "completion_length": 52.70833492279053, "epoch": 0.04764444444444445, "grad_norm": 1.2974618772894446, "kl": 3.54248046875, "learning_rate": 3.717508440504997e-07, "loss": 0.1417, "reward": 0.8750000298023224, "reward_std": 0.37628914788365364, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.833333358168602, "step": 268 }, { "completion_length": 51.687500953674316, "epoch": 0.047822222222222224, "grad_norm": 1.8625385243751729, "kl": 3.452392578125, "learning_rate": 3.6827199054038036e-07, "loss": 0.1382, "reward": 0.9791666865348816, "reward_std": 0.5766182914376259, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.7916666865348816, "step": 269 }, { "completion_length": 50.500000953674316, "epoch": 0.048, "grad_norm": 1.3683659232487602, "kl": 3.94140625, "learning_rate": 3.64799976194246e-07, "loss": 0.1577, "reward": 0.895833358168602, "reward_std": 0.4043150581419468, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.833333358168602, "step": 270 }, { "completion_length": 53.187501430511475, "epoch": 0.04817777777777778, "grad_norm": 1.7916494340378104, "kl": 4.187255859375, "learning_rate": 3.613349812750158e-07, "loss": 0.1672, "reward": 0.9375000223517418, "reward_std": 0.5739921554923058, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.7708333507180214, "step": 271 }, { "completion_length": 49.10416793823242, "epoch": 0.048355555555555554, "grad_norm": 1.9583453292652802, "kl": 2.42236328125, "learning_rate": 3.5787718568116757e-07, "loss": 0.0969, "reward": 0.895833358168602, "reward_std": 0.425799235701561, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.833333358168602, "step": 272 }, { "completion_length": 40.875001430511475, "epoch": 0.04853333333333333, "grad_norm": 2.7960145232227065, "kl": 5.267578125, "learning_rate": 3.544267689373995e-07, "loss": 0.2103, "reward": 0.9375000223517418, "reward_std": 0.6141306385397911, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.7500000223517418, "step": 273 }, { "completion_length": 52.95833444595337, "epoch": 0.04871111111111111, "grad_norm": 1.182957198324338, "kl": 1.4658203125, "learning_rate": 3.5098391018530813e-07, "loss": 0.0587, "reward": 1.1458333730697632, "reward_std": 0.48782002180814743, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 0.916666679084301, "step": 274 }, { "completion_length": 47.39583444595337, "epoch": 0.04888888888888889, "grad_norm": 3.75146574426156, "kl": 5.2431640625, "learning_rate": 3.4754878817408783e-07, "loss": 0.2094, "reward": 0.9166666939854622, "reward_std": 0.5990909859538078, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.770833358168602, "step": 275 }, { "completion_length": 55.395835399627686, "epoch": 0.04906666666666667, "grad_norm": 0.6517175065786031, "kl": 1.81494140625, "learning_rate": 3.4412158125125073e-07, "loss": 0.0726, "reward": 1.1041666939854622, "reward_std": 0.36417657881975174, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.9375000074505806, "step": 276 }, { "completion_length": 51.45833396911621, "epoch": 0.049244444444444445, "grad_norm": 1.1960838188628722, "kl": 2.79052734375, "learning_rate": 3.4070246735336645e-07, "loss": 0.1116, "reward": 0.9583333656191826, "reward_std": 0.4338619150221348, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8541666865348816, "step": 277 }, { "completion_length": 51.39583492279053, "epoch": 0.04942222222222222, "grad_norm": 1.8424306070085834, "kl": 2.34814453125, "learning_rate": 3.372916239968245e-07, "loss": 0.094, "reward": 0.9791666939854622, "reward_std": 0.28219256550073624, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.916666679084301, "step": 278 }, { "completion_length": 54.187500953674316, "epoch": 0.0496, "grad_norm": 1.1766152581682274, "kl": 0.583251953125, "learning_rate": 3.3388922826861785e-07, "loss": 0.0233, "reward": 1.1875000298023224, "reward_std": 0.40168892219662666, "rewards/equation_reward_func": 0.22916666977107525, "rewards/format_reward_func": 0.9583333432674408, "step": 279 }, { "completion_length": 54.66666793823242, "epoch": 0.049777777777777775, "grad_norm": 1.8840677531860919, "kl": 0.906982421875, "learning_rate": 3.3049545681714775e-07, "loss": 0.0363, "reward": 1.1041667014360428, "reward_std": 0.33713918924331665, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.9583333432674408, "step": 280 }, { "completion_length": 55.312500953674316, "epoch": 0.04995555555555556, "grad_norm": 1.2002604740905347, "kl": 0.96142578125, "learning_rate": 3.271104858430537e-07, "loss": 0.0384, "reward": 1.1666667014360428, "reward_std": 0.47278038039803505, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 0.9375000149011612, "step": 281 }, { "completion_length": 56.79166841506958, "epoch": 0.050133333333333335, "grad_norm": 0.6508397200310873, "kl": 0.695556640625, "learning_rate": 3.2373449109006474e-07, "loss": 0.0278, "reward": 1.2291667014360428, "reward_std": 0.299626849591732, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 1.0, "step": 282 }, { "completion_length": 55.833335399627686, "epoch": 0.05031111111111111, "grad_norm": 0.8421705085055283, "kl": 0.91455078125, "learning_rate": 3.2036764783587444e-07, "loss": 0.0366, "reward": 1.0833333730697632, "reward_std": 0.3589930906891823, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.9375000074505806, "step": 283 }, { "completion_length": 54.187501430511475, "epoch": 0.05048888888888889, "grad_norm": 0.852398579960793, "kl": 1.685791015625, "learning_rate": 3.1701013088304206e-07, "loss": 0.0673, "reward": 1.0208333507180214, "reward_std": 0.43918363004922867, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.854166679084301, "step": 284 }, { "completion_length": 51.70833444595337, "epoch": 0.050666666666666665, "grad_norm": 2.0270999635419984, "kl": 2.473388671875, "learning_rate": 3.1366211454991556e-07, "loss": 0.0991, "reward": 0.958333358168602, "reward_std": 0.41783374920487404, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8541666865348816, "step": 285 }, { "completion_length": 53.750001430511475, "epoch": 0.05084444444444444, "grad_norm": 2.1019015776731607, "kl": 1.8466796875, "learning_rate": 3.1032377266158214e-07, "loss": 0.0738, "reward": 1.0000000298023224, "reward_std": 0.4847872592508793, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.833333358168602, "step": 286 }, { "completion_length": 54.64583492279053, "epoch": 0.05102222222222222, "grad_norm": 2.0006844871954352, "kl": 2.72509765625, "learning_rate": 3.0699527854084335e-07, "loss": 0.109, "reward": 0.9791667014360428, "reward_std": 0.47457385435700417, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8541666865348816, "step": 287 }, { "completion_length": 56.270835399627686, "epoch": 0.0512, "grad_norm": 0.9939066440838259, "kl": 2.009765625, "learning_rate": 3.036768049992157e-07, "loss": 0.0803, "reward": 1.062500037252903, "reward_std": 0.41619613766670227, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.916666679084301, "step": 288 }, { "completion_length": 52.83333492279053, "epoch": 0.05137777777777778, "grad_norm": 1.3462986622451496, "kl": 2.93408203125, "learning_rate": 3.003685243279592e-07, "loss": 0.1172, "reward": 1.0625000298023224, "reward_std": 0.5250724591314793, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.8750000223517418, "step": 289 }, { "completion_length": 54.416667461395264, "epoch": 0.051555555555555556, "grad_norm": 1.2162218711888282, "kl": 2.576171875, "learning_rate": 2.9707060828913224e-07, "loss": 0.1031, "reward": 1.145833358168602, "reward_std": 0.495253074914217, "rewards/equation_reward_func": 0.2500000037252903, "rewards/format_reward_func": 0.8958333507180214, "step": 290 }, { "completion_length": 50.77083396911621, "epoch": 0.05173333333333333, "grad_norm": 3.4755720807784884, "kl": 5.18798828125, "learning_rate": 2.9378322810667304e-07, "loss": 0.2078, "reward": 1.0625000298023224, "reward_std": 0.6527481563389301, "rewards/equation_reward_func": 0.22916667349636555, "rewards/format_reward_func": 0.833333358168602, "step": 291 }, { "completion_length": 52.39583444595337, "epoch": 0.05191111111111111, "grad_norm": 2.583945749010597, "kl": 5.158203125, "learning_rate": 2.9050655445751137e-07, "loss": 0.2066, "reward": 0.9791666865348816, "reward_std": 0.6126096844673157, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.7916666939854622, "step": 292 }, { "completion_length": 53.70833444595337, "epoch": 0.052088888888888886, "grad_norm": 2.4978389077752206, "kl": 4.4462890625, "learning_rate": 2.872407574627051e-07, "loss": 0.1783, "reward": 1.1041666865348816, "reward_std": 0.4737689010798931, "rewards/equation_reward_func": 0.2083333358168602, "rewards/format_reward_func": 0.8958333507180214, "step": 293 }, { "completion_length": 53.89583444595337, "epoch": 0.05226666666666667, "grad_norm": 4.46081810506806, "kl": 6.01220703125, "learning_rate": 2.839860066786103e-07, "loss": 0.2403, "reward": 0.937500037252903, "reward_std": 0.5994852036237717, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.7708333507180214, "step": 294 }, { "completion_length": 55.66666841506958, "epoch": 0.052444444444444446, "grad_norm": 1.7753949998719913, "kl": 4.2880859375, "learning_rate": 2.807424710880756e-07, "loss": 0.1712, "reward": 1.0000000149011612, "reward_std": 0.6212911605834961, "rewards/equation_reward_func": 0.2083333358168602, "rewards/format_reward_func": 0.7916666939854622, "step": 295 }, { "completion_length": 49.72916793823242, "epoch": 0.05262222222222222, "grad_norm": 3.7499389172897573, "kl": 5.09375, "learning_rate": 2.7751031909167045e-07, "loss": 0.204, "reward": 0.8750000298023224, "reward_std": 0.49983540177345276, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.7708333432674408, "step": 296 }, { "completion_length": 53.10416793823242, "epoch": 0.0528, "grad_norm": 1.3578432580665998, "kl": 2.974609375, "learning_rate": 2.742897184989414e-07, "loss": 0.1191, "reward": 1.0625000521540642, "reward_std": 0.4058360084891319, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.916666679084301, "step": 297 }, { "completion_length": 57.458335399627686, "epoch": 0.052977777777777776, "grad_norm": 1.2056020602520727, "kl": 1.7373046875, "learning_rate": 2.710808365197e-07, "loss": 0.0695, "reward": 1.2083333730697632, "reward_std": 0.49666832759976387, "rewards/equation_reward_func": 0.27083333767950535, "rewards/format_reward_func": 0.9375000149011612, "step": 298 }, { "completion_length": 55.02083396911621, "epoch": 0.05315555555555555, "grad_norm": 1.2324543500459841, "kl": 2.1708984375, "learning_rate": 2.6788383975533993e-07, "loss": 0.0867, "reward": 1.0208333805203438, "reward_std": 0.4488043636083603, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8958333507180214, "step": 299 }, { "completion_length": 55.08333492279053, "epoch": 0.05333333333333334, "grad_norm": 0.6124221980932599, "kl": 1.42626953125, "learning_rate": 2.646988941901898e-07, "loss": 0.057, "reward": 1.1041666865348816, "reward_std": 0.2350771240890026, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.9791666716337204, "step": 300 }, { "completion_length": 56.312501430511475, "epoch": 0.05351111111111111, "grad_norm": 0.4101111309943839, "kl": 1.00390625, "learning_rate": 2.6152616518289305e-07, "loss": 0.0401, "reward": 1.1458333730697632, "reward_std": 0.26070838421583176, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.9791666716337204, "step": 301 }, { "completion_length": 56.9166693687439, "epoch": 0.05368888888888889, "grad_norm": 2.0632242124320124, "kl": 0.875, "learning_rate": 2.583658174578247e-07, "loss": 0.035, "reward": 1.1041667014360428, "reward_std": 0.28219256177544594, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.9791666716337204, "step": 302 }, { "completion_length": 56.04166793823242, "epoch": 0.05386666666666667, "grad_norm": 3.089941069383616, "kl": 0.99755859375, "learning_rate": 2.5521801509653717e-07, "loss": 0.0399, "reward": 1.1875000447034836, "reward_std": 0.4592616818845272, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 0.9583333432674408, "step": 303 }, { "completion_length": 53.312501430511475, "epoch": 0.054044444444444444, "grad_norm": 1.232358463835666, "kl": 2.2294921875, "learning_rate": 2.520829215292426e-07, "loss": 0.0892, "reward": 1.2291667088866234, "reward_std": 0.471791859716177, "rewards/equation_reward_func": 0.29166667349636555, "rewards/format_reward_func": 0.9375000074505806, "step": 304 }, { "completion_length": 52.687501430511475, "epoch": 0.05422222222222222, "grad_norm": 1.1619189366071008, "kl": 3.21044921875, "learning_rate": 2.4896069952632787e-07, "loss": 0.1283, "reward": 1.0208333730697632, "reward_std": 0.5709268562495708, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.8125000223517418, "step": 305 }, { "completion_length": 58.81250190734863, "epoch": 0.0544, "grad_norm": 1.1697787944882119, "kl": 1.681640625, "learning_rate": 2.4585151118990285e-07, "loss": 0.0673, "reward": 1.1666666865348816, "reward_std": 0.4442220404744148, "rewards/equation_reward_func": 0.22916666977107525, "rewards/format_reward_func": 0.9375000149011612, "step": 306 }, { "completion_length": 53.22916841506958, "epoch": 0.05457777777777778, "grad_norm": 1.701139042149517, "kl": 3.260986328125, "learning_rate": 2.427555179453844e-07, "loss": 0.1302, "reward": 0.9583333656191826, "reward_std": 0.46985330432653427, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8541666939854622, "step": 307 }, { "completion_length": 55.81250190734863, "epoch": 0.05475555555555556, "grad_norm": 1.1582168355268272, "kl": 2.69091796875, "learning_rate": 2.396728805331167e-07, "loss": 0.1079, "reward": 1.0625000074505806, "reward_std": 0.5002285167574883, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.854166679084301, "step": 308 }, { "completion_length": 50.500001430511475, "epoch": 0.054933333333333334, "grad_norm": 1.666774374427531, "kl": 3.728515625, "learning_rate": 2.366037590000236e-07, "loss": 0.1491, "reward": 0.8750000298023224, "reward_std": 0.46232305839657784, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.7916666939854622, "step": 309 }, { "completion_length": 56.000000953674316, "epoch": 0.05511111111111111, "grad_norm": 0.9155411365595406, "kl": 2.46533203125, "learning_rate": 2.3354831269130132e-07, "loss": 0.0986, "reward": 1.1250000298023224, "reward_std": 0.49133748933672905, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.9166666865348816, "step": 310 }, { "completion_length": 55.83333444595337, "epoch": 0.05528888888888889, "grad_norm": 1.0525255657485806, "kl": 3.048828125, "learning_rate": 2.3050670024214375e-07, "loss": 0.1218, "reward": 1.0000000447034836, "reward_std": 0.5053886137902737, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8541666865348816, "step": 311 }, { "completion_length": 52.08333492279053, "epoch": 0.055466666666666664, "grad_norm": 2.6485128650154963, "kl": 3.696044921875, "learning_rate": 2.2747907956950707e-07, "loss": 0.1479, "reward": 0.9583333656191826, "reward_std": 0.39079636707901955, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.8750000223517418, "step": 312 }, { "completion_length": 58.85416793823242, "epoch": 0.05564444444444445, "grad_norm": 1.7377493985395709, "kl": 3.342041015625, "learning_rate": 2.2446560786391132e-07, "loss": 0.1339, "reward": 0.9791666939854622, "reward_std": 0.4161961302161217, "rewards/equation_reward_func": 0.1041666679084301, "rewards/format_reward_func": 0.8750000223517418, "step": 313 }, { "completion_length": 57.10416841506958, "epoch": 0.055822222222222224, "grad_norm": 0.9728311568166134, "kl": 2.14892578125, "learning_rate": 2.2146644158127826e-07, "loss": 0.0859, "reward": 1.0416666939854622, "reward_std": 0.3602609857916832, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.916666679084301, "step": 314 }, { "completion_length": 52.83333444595337, "epoch": 0.056, "grad_norm": 0.8189600503332306, "kl": 2.77734375, "learning_rate": 2.1848173643480873e-07, "loss": 0.1111, "reward": 1.0208333507180214, "reward_std": 0.3170611411333084, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.8750000074505806, "step": 315 }, { "completion_length": 51.187501430511475, "epoch": 0.05617777777777778, "grad_norm": 1.4570844733731012, "kl": 3.875, "learning_rate": 2.1551164738689892e-07, "loss": 0.1549, "reward": 0.8958333656191826, "reward_std": 0.5094013959169388, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.7708333507180214, "step": 316 }, { "completion_length": 58.89583492279053, "epoch": 0.056355555555555555, "grad_norm": 1.3079415239293524, "kl": 1.58935546875, "learning_rate": 2.1255632864109379e-07, "loss": 0.0637, "reward": 1.1041667014360428, "reward_std": 0.34674229100346565, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.9583333432674408, "step": 317 }, { "completion_length": 56.93750190734863, "epoch": 0.05653333333333333, "grad_norm": 0.9195869282916206, "kl": 2.50048828125, "learning_rate": 2.0961593363408154e-07, "loss": 0.0999, "reward": 0.9583333656191826, "reward_std": 0.42678775265812874, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.8750000223517418, "step": 318 }, { "completion_length": 62.66666889190674, "epoch": 0.05671111111111111, "grad_norm": 1.4378742619983675, "kl": 2.0732421875, "learning_rate": 2.0669061502772772e-07, "loss": 0.083, "reward": 0.9583333507180214, "reward_std": 0.3102184720337391, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8958333507180214, "step": 319 }, { "completion_length": 58.520835399627686, "epoch": 0.05688888888888889, "grad_norm": 2.5114180738618748, "kl": 2.60693359375, "learning_rate": 2.037805247011482e-07, "loss": 0.1042, "reward": 0.9375000447034836, "reward_std": 0.3842546418309212, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8750000149011612, "step": 320 }, { "completion_length": 60.437501430511475, "epoch": 0.05706666666666667, "grad_norm": 3.344751006657643, "kl": 1.382568359375, "learning_rate": 2.008858137428251e-07, "loss": 0.0552, "reward": 1.0625000596046448, "reward_std": 0.4217669852077961, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.9375000149011612, "step": 321 }, { "completion_length": 59.16666793823242, "epoch": 0.057244444444444445, "grad_norm": 2.980207110263902, "kl": 2.493896484375, "learning_rate": 1.9800663244276127e-07, "loss": 0.0999, "reward": 1.1458333805203438, "reward_std": 0.5278613045811653, "rewards/equation_reward_func": 0.2500000074505806, "rewards/format_reward_func": 0.8958333432674408, "step": 322 }, { "completion_length": 60.35416841506958, "epoch": 0.05742222222222222, "grad_norm": 0.9630989177648202, "kl": 1.7958984375, "learning_rate": 1.9514313028467783e-07, "loss": 0.072, "reward": 0.9375000223517418, "reward_std": 0.3468805216252804, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.8541666716337204, "step": 323 }, { "completion_length": 58.10416793823242, "epoch": 0.0576, "grad_norm": 1.144547736944523, "kl": 2.6875, "learning_rate": 1.9229545593825363e-07, "loss": 0.1075, "reward": 0.8958333656191826, "reward_std": 0.4418274015188217, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.833333358168602, "step": 324 }, { "completion_length": 58.562501430511475, "epoch": 0.057777777777777775, "grad_norm": 2.5145859774979207, "kl": 3.255615234375, "learning_rate": 1.8946375725140578e-07, "loss": 0.13, "reward": 0.9166666865348816, "reward_std": 0.3747681975364685, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8541666865348816, "step": 325 }, { "completion_length": 61.66666841506958, "epoch": 0.05795555555555556, "grad_norm": 1.9889895034082692, "kl": 2.33544921875, "learning_rate": 1.8664818124261373e-07, "loss": 0.0936, "reward": 1.062500037252903, "reward_std": 0.3898078463971615, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.8958333507180214, "step": 326 }, { "completion_length": 61.18750238418579, "epoch": 0.058133333333333335, "grad_norm": 1.3792448777248967, "kl": 0.93701171875, "learning_rate": 1.8384887409328688e-07, "loss": 0.0375, "reward": 1.1250000447034836, "reward_std": 0.3477308116853237, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.9791666716337204, "step": 327 }, { "completion_length": 61.10416889190674, "epoch": 0.05831111111111111, "grad_norm": 0.7306205857498139, "kl": 2.73828125, "learning_rate": 1.8106598114017397e-07, "loss": 0.1093, "reward": 1.000000037252903, "reward_std": 0.4450269974768162, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8541666865348816, "step": 328 }, { "completion_length": 61.37500190734863, "epoch": 0.05848888888888889, "grad_norm": 1.3174084946886926, "kl": 2.44970703125, "learning_rate": 1.782996468678179e-07, "loss": 0.098, "reward": 1.000000037252903, "reward_std": 0.37628915533423424, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8958333507180214, "step": 329 }, { "completion_length": 58.125001430511475, "epoch": 0.058666666666666666, "grad_norm": 2.0400973712700896, "kl": 2.13037109375, "learning_rate": 1.7555001490105486e-07, "loss": 0.0853, "reward": 1.0416666939854622, "reward_std": 0.41228054463863373, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.8958333507180214, "step": 330 }, { "completion_length": 57.95833492279053, "epoch": 0.05884444444444444, "grad_norm": 1.5514755499900446, "kl": 2.143798828125, "learning_rate": 1.728172279975561e-07, "loss": 0.0858, "reward": 1.1875000596046448, "reward_std": 0.5512077212333679, "rewards/equation_reward_func": 0.2916666753590107, "rewards/format_reward_func": 0.8958333507180214, "step": 331 }, { "completion_length": 60.08333492279053, "epoch": 0.05902222222222222, "grad_norm": 1.3729341373528947, "kl": 1.197998046875, "learning_rate": 1.7010142804041783e-07, "loss": 0.0479, "reward": 1.2083333730697632, "reward_std": 0.45827316492795944, "rewards/equation_reward_func": 0.25000000558793545, "rewards/format_reward_func": 0.9583333432674408, "step": 332 }, { "completion_length": 59.72916841506958, "epoch": 0.0592, "grad_norm": 1.527266988721381, "kl": 3.48388671875, "learning_rate": 1.674027560307927e-07, "loss": 0.1397, "reward": 1.0000000298023224, "reward_std": 0.5845837779343128, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.8125000149011612, "step": 333 }, { "completion_length": 59.895835399627686, "epoch": 0.05937777777777778, "grad_norm": 1.050595762515859, "kl": 2.544921875, "learning_rate": 1.6472135208057125e-07, "loss": 0.1018, "reward": 0.9583333656191826, "reward_std": 0.39079636335372925, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.8750000223517418, "step": 334 }, { "completion_length": 54.583335876464844, "epoch": 0.059555555555555556, "grad_norm": 1.2933441010118374, "kl": 5.658203125, "learning_rate": 1.6205735540510674e-07, "loss": 0.2263, "reward": 0.8541666865348816, "reward_std": 0.6126096807420254, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.7291666939854622, "step": 335 }, { "completion_length": 53.47916793823242, "epoch": 0.05973333333333333, "grad_norm": 4.911862067712068, "kl": 6.6640625, "learning_rate": 1.5941090431598653e-07, "loss": 0.2666, "reward": 1.0625000223517418, "reward_std": 0.6126096844673157, "rewards/equation_reward_func": 0.25000000558793545, "rewards/format_reward_func": 0.8125000223517418, "step": 336 }, { "completion_length": 49.62500238418579, "epoch": 0.05991111111111111, "grad_norm": 1.538888251192996, "kl": 5.263671875, "learning_rate": 1.5678213621385178e-07, "loss": 0.2103, "reward": 0.6875000242143869, "reward_std": 0.5668769627809525, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.6041666883975267, "step": 337 }, { "completion_length": 54.29166841506958, "epoch": 0.060088888888888886, "grad_norm": 1.4585132738043682, "kl": 5.94921875, "learning_rate": 1.5417118758126408e-07, "loss": 0.2382, "reward": 0.9166667014360428, "reward_std": 0.6034007929265499, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.7500000149011612, "step": 338 }, { "completion_length": 51.14583492279053, "epoch": 0.06026666666666667, "grad_norm": 4.340996859120451, "kl": 7.66015625, "learning_rate": 1.515781939756186e-07, "loss": 0.3064, "reward": 0.7500000149011612, "reward_std": 0.5763868018984795, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.666666679084301, "step": 339 }, { "completion_length": 53.10416793823242, "epoch": 0.060444444444444446, "grad_norm": 7.13627262359258, "kl": 7.0458984375, "learning_rate": 1.490032900221068e-07, "loss": 0.2815, "reward": 0.7916666939854622, "reward_std": 0.4743013270199299, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.7500000223517418, "step": 340 }, { "completion_length": 51.14583492279053, "epoch": 0.06062222222222222, "grad_norm": 8.837913822415763, "kl": 8.7421875, "learning_rate": 1.4644660940672627e-07, "loss": 0.3497, "reward": 0.6875000167638063, "reward_std": 0.5780420526862144, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.583333345130086, "step": 341 }, { "completion_length": 54.68750190734863, "epoch": 0.0608, "grad_norm": 4.585825527377868, "kl": 4.6943359375, "learning_rate": 1.4390828486934058e-07, "loss": 0.1878, "reward": 0.8750000149011612, "reward_std": 0.5441443584859371, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.770833358168602, "step": 342 }, { "completion_length": 55.47916841506958, "epoch": 0.06097777777777778, "grad_norm": 2.5316402861873586, "kl": 4.51953125, "learning_rate": 1.4138844819678725e-07, "loss": 0.1809, "reward": 0.895833358168602, "reward_std": 0.552109844982624, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.7916666939854622, "step": 343 }, { "completion_length": 58.97916841506958, "epoch": 0.06115555555555555, "grad_norm": 1.300871033486432, "kl": 3.39404296875, "learning_rate": 1.3888723021603526e-07, "loss": 0.1359, "reward": 0.9166666865348816, "reward_std": 0.583796463906765, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.770833358168602, "step": 344 }, { "completion_length": 58.875001430511475, "epoch": 0.06133333333333333, "grad_norm": 2.981671476687896, "kl": 3.55615234375, "learning_rate": 1.3640476078739295e-07, "loss": 0.1422, "reward": 0.8958333656191826, "reward_std": 0.43150830641388893, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.833333358168602, "step": 345 }, { "completion_length": 57.83333492279053, "epoch": 0.061511111111111114, "grad_norm": 2.076358536272458, "kl": 2.014404296875, "learning_rate": 1.3394116879776567e-07, "loss": 0.0805, "reward": 1.0416667014360428, "reward_std": 0.47278038039803505, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.8750000149011612, "step": 346 }, { "completion_length": 60.250001430511475, "epoch": 0.06168888888888889, "grad_norm": 1.8103398488618279, "kl": 1.98193359375, "learning_rate": 1.3149658215396475e-07, "loss": 0.0794, "reward": 0.8750000298023224, "reward_std": 0.49578551203012466, "rewards/equation_reward_func": 0.1041666679084301, "rewards/format_reward_func": 0.770833358168602, "step": 347 }, { "completion_length": 64.27083539962769, "epoch": 0.06186666666666667, "grad_norm": 3.140491497435094, "kl": 1.261474609375, "learning_rate": 1.2907112777606576e-07, "loss": 0.0505, "reward": 1.1875000298023224, "reward_std": 0.5744358189404011, "rewards/equation_reward_func": 0.29166667349636555, "rewards/format_reward_func": 0.895833358168602, "step": 348 }, { "completion_length": 59.62500190734863, "epoch": 0.062044444444444444, "grad_norm": 1.061446640667196, "kl": 1.03369140625, "learning_rate": 1.2666493159081942e-07, "loss": 0.0413, "reward": 1.0000000223517418, "reward_std": 0.3102184757590294, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8958333507180214, "step": 349 }, { "completion_length": 61.47916889190674, "epoch": 0.06222222222222222, "grad_norm": 3.6962335454618858, "kl": 1.625244140625, "learning_rate": 1.2427811852511395e-07, "loss": 0.0649, "reward": 1.1250000298023224, "reward_std": 0.3937234431505203, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.9375000149011612, "step": 350 }, { "completion_length": 59.125001430511475, "epoch": 0.0624, "grad_norm": 1.6816913213888882, "kl": 1.1396484375, "learning_rate": 1.219108124994887e-07, "loss": 0.0455, "reward": 1.0000000298023224, "reward_std": 0.32624663412570953, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.9166666865348816, "step": 351 }, { "completion_length": 64.35416793823242, "epoch": 0.06257777777777777, "grad_norm": 0.8522481505886366, "kl": 0.689208984375, "learning_rate": 1.1956313642169973e-07, "loss": 0.0276, "reward": 1.0625000149011612, "reward_std": 0.27369464561343193, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.9583333432674408, "step": 352 }, { "completion_length": 61.62500190734863, "epoch": 0.06275555555555555, "grad_norm": 1.7897422346452299, "kl": 0.836669921875, "learning_rate": 1.1723521218034004e-07, "loss": 0.0335, "reward": 1.0833333879709244, "reward_std": 0.41380149126052856, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.9375000149011612, "step": 353 }, { "completion_length": 61.08333444595337, "epoch": 0.06293333333333333, "grad_norm": 2.0178846511892803, "kl": 1.092041015625, "learning_rate": 1.1492716063850971e-07, "loss": 0.0437, "reward": 0.958333358168602, "reward_std": 0.23116152733564377, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.9166666716337204, "step": 354 }, { "completion_length": 60.52083492279053, "epoch": 0.06311111111111112, "grad_norm": 2.7610480485849185, "kl": 2.68701171875, "learning_rate": 1.126391016275422e-07, "loss": 0.1075, "reward": 1.062500037252903, "reward_std": 0.34674229100346565, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.9375000074505806, "step": 355 }, { "completion_length": 55.91666793823242, "epoch": 0.0632888888888889, "grad_norm": 1.7083927263129774, "kl": 2.089111328125, "learning_rate": 1.1037115394078162e-07, "loss": 0.0836, "reward": 1.0416667088866234, "reward_std": 0.46722716465592384, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.8958333507180214, "step": 356 }, { "completion_length": 56.937501430511475, "epoch": 0.06346666666666667, "grad_norm": 1.569207458888609, "kl": 1.089111328125, "learning_rate": 1.0812343532741569e-07, "loss": 0.0436, "reward": 1.0625000223517418, "reward_std": 0.3714948333799839, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.8958333358168602, "step": 357 }, { "completion_length": 60.500001430511475, "epoch": 0.06364444444444445, "grad_norm": 3.1252392054257516, "kl": 1.154296875, "learning_rate": 1.058960624863629e-07, "loss": 0.0462, "reward": 1.062500037252903, "reward_std": 0.432873398065567, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.8958333507180214, "step": 358 }, { "completion_length": 60.75000190734863, "epoch": 0.06382222222222222, "grad_norm": 2.640265426366037, "kl": 0.833984375, "learning_rate": 1.0368915106021253e-07, "loss": 0.0334, "reward": 1.1041667312383652, "reward_std": 0.4864138960838318, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.9166666865348816, "step": 359 }, { "completion_length": 61.25000190734863, "epoch": 0.064, "grad_norm": 3.448614558728069, "kl": 1.076416015625, "learning_rate": 1.015028156292212e-07, "loss": 0.0431, "reward": 1.0833333656191826, "reward_std": 0.48238347843289375, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.8958333507180214, "step": 360 }, { "completion_length": 58.187501430511475, "epoch": 0.06417777777777778, "grad_norm": 4.146739619804781, "kl": 0.870361328125, "learning_rate": 9.933716970536427e-08, "loss": 0.0348, "reward": 1.1041667014360428, "reward_std": 0.5266816467046738, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 0.8750000223517418, "step": 361 }, { "completion_length": 59.02083444595337, "epoch": 0.06435555555555555, "grad_norm": 1.9186073301177928, "kl": 1.1279296875, "learning_rate": 9.719232572644187e-08, "loss": 0.0451, "reward": 1.0625000298023224, "reward_std": 0.456334613263607, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.8958333507180214, "step": 362 }, { "completion_length": 58.270835399627686, "epoch": 0.06453333333333333, "grad_norm": 3.1282429101773346, "kl": 1.857666015625, "learning_rate": 9.506839505024145e-08, "loss": 0.0743, "reward": 1.1250000149011612, "reward_std": 0.5089099928736687, "rewards/equation_reward_func": 0.22916666977107525, "rewards/format_reward_func": 0.8958333507180214, "step": 363 }, { "completion_length": 58.500001430511475, "epoch": 0.06471111111111111, "grad_norm": 1.2404291380846855, "kl": 2.42529296875, "learning_rate": 9.296548794875658e-08, "loss": 0.0971, "reward": 0.9791667088866234, "reward_std": 0.5063771307468414, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8541666939854622, "step": 364 }, { "completion_length": 54.187501430511475, "epoch": 0.06488888888888888, "grad_norm": 1.4856882416240396, "kl": 2.43212890625, "learning_rate": 9.088371360246105e-08, "loss": 0.0974, "reward": 0.9583333767950535, "reward_std": 0.5198958218097687, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8125000186264515, "step": 365 }, { "completion_length": 60.020835399627686, "epoch": 0.06506666666666666, "grad_norm": 1.2813862007137036, "kl": 2.4736328125, "learning_rate": 8.882318009464123e-08, "loss": 0.099, "reward": 0.9166666865348816, "reward_std": 0.347730815410614, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.854166679084301, "step": 366 }, { "completion_length": 56.47916793823242, "epoch": 0.06524444444444444, "grad_norm": 1.4700306877141611, "kl": 2.24755859375, "learning_rate": 8.678399440578365e-08, "loss": 0.0899, "reward": 0.9375000223517418, "reward_std": 0.5813841745257378, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.7708333507180214, "step": 367 }, { "completion_length": 56.85416841506958, "epoch": 0.06542222222222223, "grad_norm": 2.152590597555675, "kl": 2.2392578125, "learning_rate": 8.476626240802099e-08, "loss": 0.0897, "reward": 0.9375000223517418, "reward_std": 0.4938579201698303, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8125000186264515, "step": 368 }, { "completion_length": 52.187500953674316, "epoch": 0.0656, "grad_norm": 2.381613257880136, "kl": 3.70361328125, "learning_rate": 8.277008885963593e-08, "loss": 0.1481, "reward": 0.979166679084301, "reward_std": 0.6662384197115898, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.770833358168602, "step": 369 }, { "completion_length": 53.43750190734863, "epoch": 0.06577777777777778, "grad_norm": 1.978390854604455, "kl": 2.68603515625, "learning_rate": 8.079557739962128e-08, "loss": 0.1073, "reward": 0.937500037252903, "reward_std": 0.6371357701718807, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.7500000223517418, "step": 370 }, { "completion_length": 61.43750190734863, "epoch": 0.06595555555555556, "grad_norm": 1.670924504713265, "kl": 2.015625, "learning_rate": 7.884283054229956e-08, "loss": 0.0807, "reward": 0.9583333507180214, "reward_std": 0.36779123172163963, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.8750000223517418, "step": 371 }, { "completion_length": 55.312500953674316, "epoch": 0.06613333333333334, "grad_norm": 1.7031503147281266, "kl": 2.489501953125, "learning_rate": 7.691194967200098e-08, "loss": 0.0995, "reward": 1.041666705161333, "reward_std": 0.58039565756917, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 0.8125000186264515, "step": 372 }, { "completion_length": 57.29166889190674, "epoch": 0.06631111111111111, "grad_norm": 1.4511519173022578, "kl": 2.66552734375, "learning_rate": 7.500303503779897e-08, "loss": 0.1066, "reward": 1.0625000298023224, "reward_std": 0.6057954281568527, "rewards/equation_reward_func": 0.2708333395421505, "rewards/format_reward_func": 0.791666679084301, "step": 373 }, { "completion_length": 50.22916793823242, "epoch": 0.06648888888888889, "grad_norm": 2.3542410534514118, "kl": 4.15283203125, "learning_rate": 7.311618574830569e-08, "loss": 0.1664, "reward": 0.8541667014360428, "reward_std": 0.5683979243040085, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.7291666865348816, "step": 374 }, { "completion_length": 56.187501430511475, "epoch": 0.06666666666666667, "grad_norm": 1.3402956105324395, "kl": 2.796142578125, "learning_rate": 7.125149976652684e-08, "loss": 0.1119, "reward": 0.937500037252903, "reward_std": 0.4822668172419071, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8125000149011612, "step": 375 }, { "completion_length": 56.437501430511475, "epoch": 0.06684444444444444, "grad_norm": 1.3065544985054902, "kl": 3.40673828125, "learning_rate": 6.940907390477457e-08, "loss": 0.136, "reward": 1.020833358168602, "reward_std": 0.41557733342051506, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8750000149011612, "step": 376 }, { "completion_length": 47.29166793823242, "epoch": 0.06702222222222222, "grad_norm": 3.1009623462965976, "kl": 5.3515625, "learning_rate": 6.758900381964228e-08, "loss": 0.2139, "reward": 0.7500000298023224, "reward_std": 0.5388510599732399, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.708333358168602, "step": 377 }, { "completion_length": 53.54166793823242, "epoch": 0.0672, "grad_norm": 1.1872610426825552, "kl": 3.68310546875, "learning_rate": 6.579138400703715e-08, "loss": 0.1474, "reward": 0.8125000223517418, "reward_std": 0.4662386476993561, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.7500000149011612, "step": 378 }, { "completion_length": 51.562501430511475, "epoch": 0.06737777777777777, "grad_norm": 3.57334398201852, "kl": 4.5546875, "learning_rate": 6.401630779727451e-08, "loss": 0.1822, "reward": 0.8333333395421505, "reward_std": 0.37490642443299294, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.7708333469927311, "step": 379 }, { "completion_length": 51.125001430511475, "epoch": 0.06755555555555555, "grad_norm": 2.66140039223916, "kl": 3.92041015625, "learning_rate": 6.22638673502327e-08, "loss": 0.1571, "reward": 0.8958333544433117, "reward_std": 0.43363041803240776, "rewards/equation_reward_func": 0.1041666679084301, "rewards/format_reward_func": 0.7916666828095913, "step": 380 }, { "completion_length": 53.39583492279053, "epoch": 0.06773333333333334, "grad_norm": 1.3849153453116665, "kl": 2.9033203125, "learning_rate": 6.05341536505673e-08, "loss": 0.1163, "reward": 1.1041667088866234, "reward_std": 0.5925082266330719, "rewards/equation_reward_func": 0.2708333395421505, "rewards/format_reward_func": 0.833333358168602, "step": 381 }, { "completion_length": 51.66666793823242, "epoch": 0.06791111111111112, "grad_norm": 1.217375000722545, "kl": 4.306884765625, "learning_rate": 5.882725650298787e-08, "loss": 0.1719, "reward": 0.8958333656191826, "reward_std": 0.5843112505972385, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.7500000149011612, "step": 382 }, { "completion_length": 52.29166793823242, "epoch": 0.0680888888888889, "grad_norm": 1.3213564242297655, "kl": 3.68896484375, "learning_rate": 5.714326452759549e-08, "loss": 0.1475, "reward": 0.8750000223517418, "reward_std": 0.5200340487062931, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.7708333507180214, "step": 383 }, { "completion_length": 53.000001430511475, "epoch": 0.06826666666666667, "grad_norm": 1.4169493034835212, "kl": 3.507080078125, "learning_rate": 5.548226515528132e-08, "loss": 0.14, "reward": 0.9791666939854622, "reward_std": 0.5409447588026524, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8333333656191826, "step": 384 }, { "completion_length": 58.37500190734863, "epoch": 0.06844444444444445, "grad_norm": 1.1023233854193553, "kl": 2.56787109375, "learning_rate": 5.384434462318777e-08, "loss": 0.1028, "reward": 0.9375000298023224, "reward_std": 0.4688647836446762, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.833333358168602, "step": 385 }, { "completion_length": 56.62500238418579, "epoch": 0.06862222222222222, "grad_norm": 1.4791889195725796, "kl": 2.52001953125, "learning_rate": 5.222958797023036e-08, "loss": 0.1008, "reward": 1.1458333693444729, "reward_std": 0.5843112505972385, "rewards/equation_reward_func": 0.2916666716337204, "rewards/format_reward_func": 0.8541666828095913, "step": 386 }, { "completion_length": 57.45833492279053, "epoch": 0.0688, "grad_norm": 1.246850620199564, "kl": 2.88037109375, "learning_rate": 5.063807903268369e-08, "loss": 0.115, "reward": 1.0625000298023224, "reward_std": 0.5097602866590023, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.895833358168602, "step": 387 }, { "completion_length": 52.08333444595337, "epoch": 0.06897777777777778, "grad_norm": 1.9646365955378329, "kl": 3.38916015625, "learning_rate": 4.9069900439828115e-08, "loss": 0.1355, "reward": 1.0416666865348816, "reward_std": 0.40530357882380486, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.895833358168602, "step": 388 }, { "completion_length": 51.47916841506958, "epoch": 0.06915555555555555, "grad_norm": 5.101279066687743, "kl": 4.92578125, "learning_rate": 4.7525133609659484e-08, "loss": 0.197, "reward": 0.7916666939854622, "reward_std": 0.5373301096260548, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.7291666865348816, "step": 389 }, { "completion_length": 55.020835876464844, "epoch": 0.06933333333333333, "grad_norm": 2.859624921800997, "kl": 3.75, "learning_rate": 4.600385874466256e-08, "loss": 0.1498, "reward": 0.9166666865348816, "reward_std": 0.34296492487192154, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.854166679084301, "step": 390 }, { "completion_length": 52.208335399627686, "epoch": 0.0695111111111111, "grad_norm": 1.2049238425693272, "kl": 3.3515625, "learning_rate": 4.4506154827646915e-08, "loss": 0.1341, "reward": 0.9583333805203438, "reward_std": 0.5388510636985302, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8125000223517418, "step": 391 }, { "completion_length": 53.70833492279053, "epoch": 0.06968888888888888, "grad_norm": 1.8756766886500422, "kl": 3.8017578125, "learning_rate": 4.303209961764587e-08, "loss": 0.1517, "reward": 0.9375000149011612, "reward_std": 0.6559193283319473, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.7500000223517418, "step": 392 }, { "completion_length": 53.31250047683716, "epoch": 0.06986666666666666, "grad_norm": 2.233928517978556, "kl": 2.48046875, "learning_rate": 4.158176964587967e-08, "loss": 0.0993, "reward": 1.0208333805203438, "reward_std": 0.5134512856602669, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.8541666865348816, "step": 393 }, { "completion_length": 54.958335399627686, "epoch": 0.07004444444444445, "grad_norm": 2.1952634154416963, "kl": 2.598876953125, "learning_rate": 4.015524021178196e-08, "loss": 0.1041, "reward": 1.0833333730697632, "reward_std": 0.5025997683405876, "rewards/equation_reward_func": 0.2083333395421505, "rewards/format_reward_func": 0.8750000223517418, "step": 394 }, { "completion_length": 54.91666793823242, "epoch": 0.07022222222222223, "grad_norm": 1.556450915650428, "kl": 3.673828125, "learning_rate": 3.8752585379090317e-08, "loss": 0.1468, "reward": 0.9791666939854622, "reward_std": 0.5681380145251751, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.8125000223517418, "step": 395 }, { "completion_length": 59.64583492279053, "epoch": 0.0704, "grad_norm": 2.3442496072285035, "kl": 1.22265625, "learning_rate": 3.7373877972001255e-08, "loss": 0.0489, "reward": 1.166666716337204, "reward_std": 0.49578551575541496, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 0.9375000149011612, "step": 396 }, { "completion_length": 56.416667461395264, "epoch": 0.07057777777777778, "grad_norm": 1.382319715238746, "kl": 2.299560546875, "learning_rate": 3.601918957138844e-08, "loss": 0.092, "reward": 1.125000037252903, "reward_std": 0.5317768938839436, "rewards/equation_reward_func": 0.22916666977107525, "rewards/format_reward_func": 0.8958333507180214, "step": 397 }, { "completion_length": 55.00000190734863, "epoch": 0.07075555555555556, "grad_norm": 1.8307641531327195, "kl": 3.07080078125, "learning_rate": 3.46885905110873e-08, "loss": 0.1227, "reward": 1.0208333730697632, "reward_std": 0.532146617770195, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.8541666939854622, "step": 398 }, { "completion_length": 54.66666793823242, "epoch": 0.07093333333333333, "grad_norm": 1.5553001993082431, "kl": 3.23291015625, "learning_rate": 3.3382149874242814e-08, "loss": 0.1295, "reward": 1.0000000298023224, "reward_std": 0.502599760890007, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8541666865348816, "step": 399 }, { "completion_length": 54.60416841506958, "epoch": 0.07111111111111111, "grad_norm": 1.574316545264386, "kl": 3.5634765625, "learning_rate": 3.20999354897229e-08, "loss": 0.1431, "reward": 0.9583333507180214, "reward_std": 0.4067096970975399, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.854166679084301, "step": 400 }, { "completion_length": 61.43750190734863, "epoch": 0.07128888888888889, "grad_norm": 2.037039114885329, "kl": 2.79345703125, "learning_rate": 3.0842013928596754e-08, "loss": 0.1117, "reward": 0.958333358168602, "reward_std": 0.2957112602889538, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8958333432674408, "step": 401 }, { "completion_length": 55.125001430511475, "epoch": 0.07146666666666666, "grad_norm": 3.5875644910793727, "kl": 3.16015625, "learning_rate": 2.9608450500678562e-08, "loss": 0.1265, "reward": 0.8333333656191826, "reward_std": 0.41380149498581886, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 0.8125000298023224, "step": 402 }, { "completion_length": 54.91666841506958, "epoch": 0.07164444444444444, "grad_norm": 1.6377386422578162, "kl": 1.947998046875, "learning_rate": 2.839930925113715e-08, "loss": 0.0777, "reward": 1.104166716337204, "reward_std": 0.4778187908232212, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.9375000149011612, "step": 403 }, { "completion_length": 57.47916793823242, "epoch": 0.07182222222222222, "grad_norm": 1.0788472393855095, "kl": 2.4091796875, "learning_rate": 2.721465295716996e-08, "loss": 0.0963, "reward": 0.9791667088866234, "reward_std": 0.46483253315091133, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.8541666865348816, "step": 404 }, { "completion_length": 56.187500953674316, "epoch": 0.072, "grad_norm": 1.5018621132397523, "kl": 1.85888671875, "learning_rate": 2.605454312474448e-08, "loss": 0.0745, "reward": 1.1666667014360428, "reward_std": 0.5234047770500183, "rewards/equation_reward_func": 0.2500000074505806, "rewards/format_reward_func": 0.9166666865348816, "step": 405 }, { "completion_length": 56.312501430511475, "epoch": 0.07217777777777777, "grad_norm": 2.078201578902569, "kl": 2.600341796875, "learning_rate": 2.4919039985404622e-08, "loss": 0.1039, "reward": 1.000000037252903, "reward_std": 0.5240839384496212, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8541666865348816, "step": 406 }, { "completion_length": 55.083335399627686, "epoch": 0.07235555555555556, "grad_norm": 2.2993454586501643, "kl": 3.27734375, "learning_rate": 2.380820249314375e-08, "loss": 0.131, "reward": 0.9791667088866234, "reward_std": 0.48782002553343773, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8333333507180214, "step": 407 }, { "completion_length": 56.854167461395264, "epoch": 0.07253333333333334, "grad_norm": 1.3535072176160663, "kl": 2.1513671875, "learning_rate": 2.2722088321343258e-08, "loss": 0.0861, "reward": 0.9166666939854622, "reward_std": 0.42678775265812874, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.8541666939854622, "step": 408 }, { "completion_length": 55.250000953674316, "epoch": 0.07271111111111112, "grad_norm": 1.8133842507703675, "kl": 2.7421875, "learning_rate": 2.1660753859779223e-08, "loss": 0.1095, "reward": 1.0416667088866234, "reward_std": 0.5461693182587624, "rewards/equation_reward_func": 0.2083333358168602, "rewards/format_reward_func": 0.8333333507180214, "step": 409 }, { "completion_length": 55.14583396911621, "epoch": 0.07288888888888889, "grad_norm": 0.9743576087041721, "kl": 2.7861328125, "learning_rate": 2.0624254211693894e-08, "loss": 0.1113, "reward": 1.000000037252903, "reward_std": 0.39079635962843895, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8958333507180214, "step": 410 }, { "completion_length": 55.02083492279053, "epoch": 0.07306666666666667, "grad_norm": 2.516245940031829, "kl": 3.21875, "learning_rate": 1.9612643190935196e-08, "loss": 0.1288, "reward": 0.9791667088866234, "reward_std": 0.5782451070845127, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.8125000223517418, "step": 411 }, { "completion_length": 60.187500953674316, "epoch": 0.07324444444444445, "grad_norm": 2.6330682117724113, "kl": 1.1962890625, "learning_rate": 1.8625973319162602e-08, "loss": 0.0478, "reward": 1.020833358168602, "reward_std": 0.4778187870979309, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.8750000298023224, "step": 412 }, { "completion_length": 51.312500953674316, "epoch": 0.07342222222222222, "grad_norm": 1.1821663445604937, "kl": 4.4501953125, "learning_rate": 1.7664295823120347e-08, "loss": 0.178, "reward": 0.8958333507180214, "reward_std": 0.618318747729063, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.7291666865348816, "step": 413 }, { "completion_length": 57.75000190734863, "epoch": 0.0736, "grad_norm": 0.8122615265168446, "kl": 2.16064453125, "learning_rate": 1.672766063197789e-08, "loss": 0.0861, "reward": 0.9791666939854622, "reward_std": 0.37575671449303627, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.8958333507180214, "step": 414 }, { "completion_length": 57.22916841506958, "epoch": 0.07377777777777778, "grad_norm": 3.0181550761407716, "kl": 1.967529296875, "learning_rate": 1.5816116374737452e-08, "loss": 0.0785, "reward": 1.0833333730697632, "reward_std": 0.39079636335372925, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.9375000149011612, "step": 415 }, { "completion_length": 55.375000953674316, "epoch": 0.07395555555555555, "grad_norm": 1.1321615634392828, "kl": 0.98779296875, "learning_rate": 1.492971037770924e-08, "loss": 0.0395, "reward": 1.1458333730697632, "reward_std": 0.34674229472875595, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.9583333432674408, "step": 416 }, { "completion_length": 55.041667461395264, "epoch": 0.07413333333333333, "grad_norm": 1.1137121433760955, "kl": 2.485595703125, "learning_rate": 1.4068488662054733e-08, "loss": 0.0994, "reward": 1.0833333805203438, "reward_std": 0.49578550457954407, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.8958333507180214, "step": 417 }, { "completion_length": 59.33333444595337, "epoch": 0.0743111111111111, "grad_norm": 1.8567494155306574, "kl": 3.08056640625, "learning_rate": 1.3232495941396637e-08, "loss": 0.1232, "reward": 1.0416667014360428, "reward_std": 0.581656701862812, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.8541666939854622, "step": 418 }, { "completion_length": 59.79166793823242, "epoch": 0.07448888888888888, "grad_norm": 1.3834584558844922, "kl": 2.0390625, "learning_rate": 1.2421775619498199e-08, "loss": 0.0815, "reward": 1.1250000298023224, "reward_std": 0.3532840199768543, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 0.9583333432674408, "step": 419 }, { "completion_length": 58.66666793823242, "epoch": 0.07466666666666667, "grad_norm": 0.8828120170847628, "kl": 1.23046875, "learning_rate": 1.1636369788008971e-08, "loss": 0.0493, "reward": 1.1875000298023224, "reward_std": 0.4722479432821274, "rewards/equation_reward_func": 0.25000000558793545, "rewards/format_reward_func": 0.9375000149011612, "step": 420 }, { "completion_length": 53.39583444595337, "epoch": 0.07484444444444445, "grad_norm": 1.6828918296780182, "kl": 2.09814453125, "learning_rate": 1.0876319224279895e-08, "loss": 0.0837, "reward": 0.9791666939854622, "reward_std": 0.44323352351784706, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.854166679084301, "step": 421 }, { "completion_length": 55.08333444595337, "epoch": 0.07502222222222223, "grad_norm": 1.320246490235028, "kl": 3.072265625, "learning_rate": 1.014166338924627e-08, "loss": 0.123, "reward": 0.9375000298023224, "reward_std": 0.4662386514246464, "rewards/equation_reward_func": 0.1041666679084301, "rewards/format_reward_func": 0.833333358168602, "step": 422 }, { "completion_length": 53.06250190734863, "epoch": 0.0752, "grad_norm": 1.4581841364335688, "kl": 2.12109375, "learning_rate": 9.432440425378663e-09, "loss": 0.0848, "reward": 1.0208333730697632, "reward_std": 0.38435182720422745, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8958333507180214, "step": 423 }, { "completion_length": 56.97916841506958, "epoch": 0.07537777777777778, "grad_norm": 3.1980164553732098, "kl": 2.3525390625, "learning_rate": 8.748687154702672e-09, "loss": 0.0941, "reward": 1.0416666939854622, "reward_std": 0.5055268332362175, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.854166679084301, "step": 424 }, { "completion_length": 57.645835399627686, "epoch": 0.07555555555555556, "grad_norm": 1.5490994703153829, "kl": 1.429931640625, "learning_rate": 8.090439076887556e-09, "loss": 0.0573, "reward": 1.2500000447034836, "reward_std": 0.49578551575541496, "rewards/equation_reward_func": 0.31250000558793545, "rewards/format_reward_func": 0.9375000149011612, "step": 425 }, { "completion_length": 55.70833492279053, "epoch": 0.07573333333333333, "grad_norm": 0.9371848147502277, "kl": 2.319580078125, "learning_rate": 7.457730367402549e-09, "loss": 0.0928, "reward": 1.0416666939854622, "reward_std": 0.46833235025405884, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8958333507180214, "step": 426 }, { "completion_length": 55.25000238418579, "epoch": 0.07591111111111111, "grad_norm": 1.4055409510031482, "kl": 3.28759765625, "learning_rate": 6.850593875742827e-09, "loss": 0.1316, "reward": 1.0208333730697632, "reward_std": 0.566160973161459, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.8333333507180214, "step": 427 }, { "completion_length": 55.562501430511475, "epoch": 0.07608888888888889, "grad_norm": 1.344704452916086, "kl": 3.205810546875, "learning_rate": 6.269061123724162e-09, "loss": 0.1285, "reward": 0.9583333730697632, "reward_std": 0.43123578280210495, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 0.8541666865348816, "step": 428 }, { "completion_length": 59.22916793823242, "epoch": 0.07626666666666666, "grad_norm": 2.1608184033551643, "kl": 1.736328125, "learning_rate": 5.713162303845886e-09, "loss": 0.0696, "reward": 1.2083333879709244, "reward_std": 0.5388510562479496, "rewards/equation_reward_func": 0.2708333395421505, "rewards/format_reward_func": 0.9375000149011612, "step": 429 }, { "completion_length": 54.125001430511475, "epoch": 0.07644444444444444, "grad_norm": 1.8308437940638247, "kl": 1.9599609375, "learning_rate": 5.182926277723821e-09, "loss": 0.0783, "reward": 1.0625000223517418, "reward_std": 0.5724712051451206, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 0.8333333432674408, "step": 430 }, { "completion_length": 56.500000953674316, "epoch": 0.07662222222222222, "grad_norm": 1.6765820181739854, "kl": 2.41552734375, "learning_rate": 4.678380574591356e-09, "loss": 0.0966, "reward": 1.0000000298023224, "reward_std": 0.438050027936697, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8541666939854622, "step": 431 }, { "completion_length": 57.27083492279053, "epoch": 0.0768, "grad_norm": 1.3657278862024629, "kl": 2.685546875, "learning_rate": 4.199551389870659e-09, "loss": 0.1074, "reward": 0.8541666865348816, "reward_std": 0.45774073153734207, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 0.7916666865348816, "step": 432 }, { "completion_length": 55.64583492279053, "epoch": 0.07697777777777778, "grad_norm": 0.8539346194962987, "kl": 2.49462890625, "learning_rate": 3.746463583812143e-09, "loss": 0.0997, "reward": 1.041666679084301, "reward_std": 0.4067096970975399, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.8958333507180214, "step": 433 }, { "completion_length": 57.875000953674316, "epoch": 0.07715555555555556, "grad_norm": 1.2719357827752074, "kl": 2.07666015625, "learning_rate": 3.3191406802041688e-09, "loss": 0.0831, "reward": 1.0416666939854622, "reward_std": 0.3332236036658287, "rewards/equation_reward_func": 0.12500000186264515, "rewards/format_reward_func": 0.916666679084301, "step": 434 }, { "completion_length": 54.312500953674316, "epoch": 0.07733333333333334, "grad_norm": 1.4529499100749974, "kl": 3.67041015625, "learning_rate": 2.9176048651513575e-09, "loss": 0.1465, "reward": 0.958333358168602, "reward_std": 0.5915607511997223, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 0.7708333507180214, "step": 435 }, { "completion_length": 55.08333492279053, "epoch": 0.07751111111111111, "grad_norm": 2.3005383670846693, "kl": 3.82958984375, "learning_rate": 2.541876985923119e-09, "loss": 0.1532, "reward": 0.9791667014360428, "reward_std": 0.49742312356829643, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.8333333432674408, "step": 436 }, { "completion_length": 54.20833444595337, "epoch": 0.07768888888888889, "grad_norm": 1.2230209357396296, "kl": 2.626220703125, "learning_rate": 2.1919765498708554e-09, "loss": 0.1052, "reward": 0.979166716337204, "reward_std": 0.4918699115514755, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.854166679084301, "step": 437 }, { "completion_length": 53.04166841506958, "epoch": 0.07786666666666667, "grad_norm": 1.2004863808274155, "kl": 2.7109375, "learning_rate": 1.867921723415433e-09, "loss": 0.1085, "reward": 1.0208333786576986, "reward_std": 0.507898073643446, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.812500013038516, "step": 438 }, { "completion_length": 56.416667461395264, "epoch": 0.07804444444444444, "grad_norm": 2.4507514591114656, "kl": 1.8857421875, "learning_rate": 1.5697293311039973e-09, "loss": 0.0755, "reward": 1.020833358168602, "reward_std": 0.4759799763560295, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.854166679084301, "step": 439 }, { "completion_length": 54.937501430511475, "epoch": 0.07822222222222222, "grad_norm": 2.2350073546663545, "kl": 2.593994140625, "learning_rate": 1.2974148547362228e-09, "loss": 0.1039, "reward": 1.0833333656191826, "reward_std": 0.5678940936923027, "rewards/equation_reward_func": 0.2291666716337204, "rewards/format_reward_func": 0.8541666939854622, "step": 440 }, { "completion_length": 57.60416793823242, "epoch": 0.0784, "grad_norm": 2.858115206523939, "kl": 1.82763671875, "learning_rate": 1.0509924325609598e-09, "loss": 0.0733, "reward": 1.020833358168602, "reward_std": 0.42745841667056084, "rewards/equation_reward_func": 0.1458333358168602, "rewards/format_reward_func": 0.8750000149011612, "step": 441 }, { "completion_length": 58.10416793823242, "epoch": 0.07857777777777777, "grad_norm": 0.8237264126414511, "kl": 2.230712890625, "learning_rate": 8.304748585417076e-10, "loss": 0.0891, "reward": 1.1458333805203438, "reward_std": 0.41129202395677567, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 0.9583333358168602, "step": 442 }, { "completion_length": 55.27083492279053, "epoch": 0.07875555555555555, "grad_norm": 2.025879478872808, "kl": 2.39501953125, "learning_rate": 6.358735816926475e-10, "loss": 0.0957, "reward": 1.1458333805203438, "reward_std": 0.5063771307468414, "rewards/equation_reward_func": 0.29166667722165585, "rewards/format_reward_func": 0.8541666865348816, "step": 443 }, { "completion_length": 56.437500953674316, "epoch": 0.07893333333333333, "grad_norm": 1.7721145491613786, "kl": 2.5361328125, "learning_rate": 4.671987054842841e-10, "loss": 0.1016, "reward": 0.9375000223517418, "reward_std": 0.3828308768570423, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.854166679084301, "step": 444 }, { "completion_length": 54.43750190734863, "epoch": 0.0791111111111111, "grad_norm": 2.0018604753277365, "kl": 2.74658203125, "learning_rate": 3.2445898731853216e-10, "loss": 0.1102, "reward": 0.937500037252903, "reward_std": 0.5536307953298092, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8125000298023224, "step": 445 }, { "completion_length": 55.375001430511475, "epoch": 0.0792888888888889, "grad_norm": 1.6563944165609519, "kl": 2.6494140625, "learning_rate": 2.076618380744133e-10, "loss": 0.1061, "reward": 0.9375000223517418, "reward_std": 0.4548136591911316, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.8541666939854622, "step": 446 }, { "completion_length": 56.208335399627686, "epoch": 0.07946666666666667, "grad_norm": 1.4046224808818546, "kl": 2.59716796875, "learning_rate": 1.16813321723197e-10, "loss": 0.1038, "reward": 1.0416667088866234, "reward_std": 0.528849832713604, "rewards/equation_reward_func": 0.16666666977107525, "rewards/format_reward_func": 0.8750000223517418, "step": 447 }, { "completion_length": 56.187501430511475, "epoch": 0.07964444444444445, "grad_norm": 1.9614743025012586, "kl": 2.751953125, "learning_rate": 5.191815501343066e-11, "loss": 0.11, "reward": 1.1041667014360428, "reward_std": 0.462188757956028, "rewards/equation_reward_func": 0.20833333767950535, "rewards/format_reward_func": 0.8958333432674408, "step": 448 }, { "completion_length": 58.187500953674316, "epoch": 0.07982222222222222, "grad_norm": 1.1288440912101967, "kl": 2.55517578125, "learning_rate": 1.2979707226135061e-11, "loss": 0.1023, "reward": 0.9791667014360428, "reward_std": 0.437777504324913, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.8541666865348816, "step": 449 }, { "completion_length": 57.750001430511475, "epoch": 0.08, "grad_norm": 1.6866388049647176, "kl": 1.45751953125, "learning_rate": 0.0, "loss": 0.0583, "reward": 1.1875000447034836, "reward_std": 0.32525811716914177, "rewards/equation_reward_func": 0.2083333395421505, "rewards/format_reward_func": 0.9791666716337204, "step": 450 }, { "epoch": 0.08, "step": 450, "total_flos": 0.0, "train_loss": 0.08981622397834212, "train_runtime": 6983.3902, "train_samples_per_second": 3.093, "train_steps_per_second": 0.064 } ], "logging_steps": 1, "max_steps": 450, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }