{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 452, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 6.375993978538379, "learning_rate": 1.0869565217391303e-08, "logits/chosen": -1.8598690032958984, "logits/rejected": -1.813749074935913, "logps/chosen": -155.32705688476562, "logps/rejected": -119.10728454589844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 6.761130122357207, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -1.7754908800125122, "logits/rejected": -1.9477850198745728, "logps/chosen": -164.1376495361328, "logps/rejected": -177.90771484375, "loss": 0.693, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": -0.0001304509787587449, "rewards/margins": -0.0002223679330199957, "rewards/rejected": 9.191703429678455e-05, "step": 10 }, { "epoch": 0.04, "grad_norm": 6.476650286184237, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -1.813336730003357, "logits/rejected": -1.8786985874176025, "logps/chosen": -154.4861297607422, "logps/rejected": -178.77951049804688, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.000607121444772929, "rewards/margins": 0.000769242993555963, "rewards/rejected": -0.00016212157788686454, "step": 20 }, { "epoch": 0.07, "grad_norm": 6.2480047328303705, "learning_rate": 3.260869565217391e-07, "logits/chosen": -1.729405164718628, "logits/rejected": -1.8028945922851562, "logps/chosen": -159.96788024902344, "logps/rejected": -153.2834930419922, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.003659260692074895, "rewards/margins": 0.0018362473929300904, "rewards/rejected": 0.001823012949898839, "step": 30 }, { "epoch": 0.09, "grad_norm": 6.780490815116132, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -1.715442419052124, "logits/rejected": -1.8770908117294312, "logps/chosen": -168.16839599609375, "logps/rejected": -172.1099853515625, "loss": 0.6899, "rewards/accuracies": 0.65625, "rewards/chosen": 0.010979695245623589, "rewards/margins": 0.005665521137416363, "rewards/rejected": 0.0053141750395298, "step": 40 }, { "epoch": 0.11, "grad_norm": 6.269256344947319, "learning_rate": 4.998802589665008e-07, "logits/chosen": -1.6737487316131592, "logits/rejected": -1.8300039768218994, "logps/chosen": -157.9713134765625, "logps/rejected": -184.8284912109375, "loss": 0.6862, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.024453260004520416, "rewards/margins": 0.0103492122143507, "rewards/rejected": 0.014104047790169716, "step": 50 }, { "epoch": 0.13, "grad_norm": 5.9502015182077, "learning_rate": 4.985344892885899e-07, "logits/chosen": -1.7060184478759766, "logits/rejected": -1.8548672199249268, "logps/chosen": -182.36965942382812, "logps/rejected": -164.8349151611328, "loss": 0.6821, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.052476245909929276, "rewards/margins": 0.026238251477479935, "rewards/rejected": 0.02623800002038479, "step": 60 }, { "epoch": 0.15, "grad_norm": 6.211310793736214, "learning_rate": 4.957013543421161e-07, "logits/chosen": -1.6781314611434937, "logits/rejected": -1.7970473766326904, "logps/chosen": -156.598388671875, "logps/rejected": -159.87399291992188, "loss": 0.6735, "rewards/accuracies": 0.75, "rewards/chosen": 0.06103619933128357, "rewards/margins": 0.040171049535274506, "rewards/rejected": 0.020865142345428467, "step": 70 }, { "epoch": 0.18, "grad_norm": 5.997223838682246, "learning_rate": 4.913978091441985e-07, "logits/chosen": -1.591805338859558, "logits/rejected": -1.6250261068344116, "logps/chosen": -149.93402099609375, "logps/rejected": -149.83056640625, "loss": 0.6686, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08923570066690445, "rewards/margins": 0.0577419213950634, "rewards/rejected": 0.03149377182126045, "step": 80 }, { "epoch": 0.2, "grad_norm": 5.698461942985052, "learning_rate": 4.856496084449218e-07, "logits/chosen": -1.6099445819854736, "logits/rejected": -1.7586275339126587, "logps/chosen": -168.14712524414062, "logps/rejected": -164.38485717773438, "loss": 0.6648, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.0959181934595108, "rewards/margins": 0.06268787384033203, "rewards/rejected": 0.03323032334446907, "step": 90 }, { "epoch": 0.22, "grad_norm": 5.821198867398168, "learning_rate": 4.784911525969344e-07, "logits/chosen": -1.5159536600112915, "logits/rejected": -1.6637051105499268, "logps/chosen": -144.79641723632812, "logps/rejected": -155.20135498046875, "loss": 0.6586, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.11348827183246613, "rewards/margins": 0.057287197560071945, "rewards/rejected": 0.056201063096523285, "step": 100 }, { "epoch": 0.22, "eval_logits/chosen": -1.7960463762283325, "eval_logits/rejected": -1.5849276781082153, "eval_logps/chosen": -131.66993713378906, "eval_logps/rejected": -149.56881713867188, "eval_loss": 0.6694082617759705, "eval_rewards/accuracies": 0.6931818127632141, "eval_rewards/chosen": 0.130432590842247, "eval_rewards/margins": 0.051959823817014694, "eval_rewards/rejected": 0.07847274094820023, "eval_runtime": 64.0941, "eval_samples_per_second": 10.516, "eval_steps_per_second": 0.343, "step": 100 }, { "epoch": 0.24, "grad_norm": 5.737042856571591, "learning_rate": 4.699652816850686e-07, "logits/chosen": -1.5323385000228882, "logits/rejected": -1.623490571975708, "logps/chosen": -153.6015625, "logps/rejected": -182.46792602539062, "loss": 0.6609, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.11847398430109024, "rewards/margins": 0.07614362984895706, "rewards/rejected": 0.04233035817742348, "step": 110 }, { "epoch": 0.27, "grad_norm": 5.744846632920582, "learning_rate": 4.6012301914802236e-07, "logits/chosen": -1.4478352069854736, "logits/rejected": -1.6309044361114502, "logps/chosen": -145.10116577148438, "logps/rejected": -180.8983154296875, "loss": 0.6529, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1110893040895462, "rewards/margins": 0.07829709351062775, "rewards/rejected": 0.032792218029499054, "step": 120 }, { "epoch": 0.29, "grad_norm": 5.866234048530386, "learning_rate": 4.490232664264109e-07, "logits/chosen": -1.5028413534164429, "logits/rejected": -1.4950730800628662, "logps/chosen": -157.7068634033203, "logps/rejected": -163.91094970703125, "loss": 0.6543, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.13327065110206604, "rewards/margins": 0.0991601049900055, "rewards/rejected": 0.03411053121089935, "step": 130 }, { "epoch": 0.31, "grad_norm": 5.2985790715652294, "learning_rate": 4.3673245046457916e-07, "logits/chosen": -1.439444899559021, "logits/rejected": -1.4795516729354858, "logps/chosen": -133.6232147216797, "logps/rejected": -162.96536254882812, "loss": 0.6428, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.12999793887138367, "rewards/margins": 0.10853303968906403, "rewards/rejected": 0.021464908495545387, "step": 140 }, { "epoch": 0.33, "grad_norm": 5.49213913037938, "learning_rate": 4.2332412617571544e-07, "logits/chosen": -1.470943570137024, "logits/rejected": -1.632310152053833, "logps/chosen": -152.8888702392578, "logps/rejected": -153.59072875976562, "loss": 0.6469, "rewards/accuracies": 0.75, "rewards/chosen": 0.13645324110984802, "rewards/margins": 0.09464041888713837, "rewards/rejected": 0.041812822222709656, "step": 150 }, { "epoch": 0.35, "grad_norm": 5.401465863070874, "learning_rate": 4.088785362493313e-07, "logits/chosen": -1.5582928657531738, "logits/rejected": -1.6405029296875, "logps/chosen": -157.95230102539062, "logps/rejected": -166.15577697753906, "loss": 0.6374, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.1728488802909851, "rewards/margins": 0.13387203216552734, "rewards/rejected": 0.03897686302661896, "step": 160 }, { "epoch": 0.38, "grad_norm": 5.73235647435814, "learning_rate": 3.93482130935458e-07, "logits/chosen": -1.4991676807403564, "logits/rejected": -1.6196138858795166, "logps/chosen": -147.5597381591797, "logps/rejected": -162.0667266845703, "loss": 0.6319, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.1789514720439911, "rewards/margins": 0.13890545070171356, "rewards/rejected": 0.040045998990535736, "step": 170 }, { "epoch": 0.4, "grad_norm": 5.503184592670663, "learning_rate": 3.772270506794322e-07, "logits/chosen": -1.4963356256484985, "logits/rejected": -1.6418964862823486, "logps/chosen": -137.38124084472656, "logps/rejected": -166.94546508789062, "loss": 0.6335, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.13972221314907074, "rewards/margins": 0.11094947904348373, "rewards/rejected": 0.028772741556167603, "step": 180 }, { "epoch": 0.42, "grad_norm": 5.574317629060601, "learning_rate": 3.6021057470346455e-07, "logits/chosen": -1.4465644359588623, "logits/rejected": -1.5275087356567383, "logps/chosen": -142.6824951171875, "logps/rejected": -147.18019104003906, "loss": 0.6315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1697317659854889, "rewards/margins": 0.1489827036857605, "rewards/rejected": 0.020749058574438095, "step": 190 }, { "epoch": 0.44, "grad_norm": 5.601814789329134, "learning_rate": 3.4253453883497864e-07, "logits/chosen": -1.5157060623168945, "logits/rejected": -1.5182088613510132, "logps/chosen": -138.18411254882812, "logps/rejected": -177.93359375, "loss": 0.6342, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.15571925044059753, "rewards/margins": 0.12424109876155853, "rewards/rejected": 0.0314781591296196, "step": 200 }, { "epoch": 0.44, "eval_logits/chosen": -1.7958874702453613, "eval_logits/rejected": -1.5702953338623047, "eval_logps/chosen": -127.2863540649414, "eval_logps/rejected": -148.07151794433594, "eval_loss": 0.6580806970596313, "eval_rewards/accuracies": 0.7272727489471436, "eval_rewards/chosen": 0.1742684245109558, "eval_rewards/margins": 0.08082254230976105, "eval_rewards/rejected": 0.09344588965177536, "eval_runtime": 64.1886, "eval_samples_per_second": 10.5, "eval_steps_per_second": 0.343, "step": 200 }, { "epoch": 0.46, "grad_norm": 6.199728402265358, "learning_rate": 3.2430472606575104e-07, "logits/chosen": -1.5152722597122192, "logits/rejected": -1.5520877838134766, "logps/chosen": -131.77743530273438, "logps/rejected": -161.01565551757812, "loss": 0.6259, "rewards/accuracies": 0.8125, "rewards/chosen": 0.17089861631393433, "rewards/margins": 0.1506127417087555, "rewards/rejected": 0.020285870879888535, "step": 210 }, { "epoch": 0.49, "grad_norm": 5.744865266952806, "learning_rate": 3.056302334890786e-07, "logits/chosen": -1.5393413305282593, "logits/rejected": -1.6224792003631592, "logps/chosen": -139.15541076660156, "logps/rejected": -153.6171417236328, "loss": 0.6265, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.181275874376297, "rewards/margins": 0.16251231729984283, "rewards/rejected": 0.018763558939099312, "step": 220 }, { "epoch": 0.51, "grad_norm": 5.612480895027258, "learning_rate": 2.866228194035623e-07, "logits/chosen": -1.5460705757141113, "logits/rejected": -1.6816442012786865, "logps/chosen": -143.0059356689453, "logps/rejected": -156.33035278320312, "loss": 0.6183, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.17381103336811066, "rewards/margins": 0.15255849063396454, "rewards/rejected": 0.021252544596791267, "step": 230 }, { "epoch": 0.53, "grad_norm": 5.69741102915849, "learning_rate": 2.673962344907953e-07, "logits/chosen": -1.509167194366455, "logits/rejected": -1.6316970586776733, "logps/chosen": -162.8449249267578, "logps/rejected": -151.0370635986328, "loss": 0.6158, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.17691971361637115, "rewards/margins": 0.16015170514583588, "rewards/rejected": 0.016767997294664383, "step": 240 }, { "epoch": 0.55, "grad_norm": 6.055880696709496, "learning_rate": 2.4806554106954945e-07, "logits/chosen": -1.558767557144165, "logits/rejected": -1.66998291015625, "logps/chosen": -154.49603271484375, "logps/rejected": -168.3751220703125, "loss": 0.6173, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.17227458953857422, "rewards/margins": 0.16756592690944672, "rewards/rejected": 0.004708637483417988, "step": 250 }, { "epoch": 0.58, "grad_norm": 5.955808609049162, "learning_rate": 2.287464245004132e-07, "logits/chosen": -1.4977203607559204, "logits/rejected": -1.6133390665054321, "logps/chosen": -134.21728515625, "logps/rejected": -176.34817504882812, "loss": 0.6141, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.16645415127277374, "rewards/margins": 0.17111308872699738, "rewards/rejected": -0.004658935125917196, "step": 260 }, { "epoch": 0.6, "grad_norm": 5.716956108430631, "learning_rate": 2.0955450086180881e-07, "logits/chosen": -1.5042165517807007, "logits/rejected": -1.6705764532089233, "logps/chosen": -128.129638671875, "logps/rejected": -168.9424591064453, "loss": 0.6087, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.18247731029987335, "rewards/margins": 0.19755597412586212, "rewards/rejected": -0.015078653581440449, "step": 270 }, { "epoch": 0.62, "grad_norm": 6.529628318945036, "learning_rate": 1.9060462504063227e-07, "logits/chosen": -1.6098215579986572, "logits/rejected": -1.7113733291625977, "logps/chosen": -152.48533630371094, "logps/rejected": -169.33935546875, "loss": 0.607, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.1736232191324234, "rewards/margins": 0.19309216737747192, "rewards/rejected": -0.019468944519758224, "step": 280 }, { "epoch": 0.64, "grad_norm": 5.964891862645329, "learning_rate": 1.7201020337827556e-07, "logits/chosen": -1.5367965698242188, "logits/rejected": -1.6592748165130615, "logps/chosen": -155.07492065429688, "logps/rejected": -157.03305053710938, "loss": 0.6077, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.15863782167434692, "rewards/margins": 0.16328461468219757, "rewards/rejected": -0.004646780900657177, "step": 290 }, { "epoch": 0.66, "grad_norm": 5.912017097517653, "learning_rate": 1.5388251498553261e-07, "logits/chosen": -1.5541818141937256, "logits/rejected": -1.7061693668365479, "logps/chosen": -151.648681640625, "logps/rejected": -161.48257446289062, "loss": 0.5967, "rewards/accuracies": 0.875, "rewards/chosen": 0.17861925065517426, "rewards/margins": 0.2286207377910614, "rewards/rejected": -0.050001464784145355, "step": 300 }, { "epoch": 0.66, "eval_logits/chosen": -1.8740853071212769, "eval_logits/rejected": -1.6454341411590576, "eval_logps/chosen": -128.13084411621094, "eval_logps/rejected": -150.56195068359375, "eval_loss": 0.6526921391487122, "eval_rewards/accuracies": 0.7159090638160706, "eval_rewards/chosen": 0.16582368314266205, "eval_rewards/margins": 0.09728217869997025, "eval_rewards/rejected": 0.06854148954153061, "eval_runtime": 64.2292, "eval_samples_per_second": 10.494, "eval_steps_per_second": 0.343, "step": 300 }, { "epoch": 0.69, "grad_norm": 5.769946207769061, "learning_rate": 1.3633004578800611e-07, "logits/chosen": -1.6252295970916748, "logits/rejected": -1.6870454549789429, "logps/chosen": -145.02096557617188, "logps/rejected": -175.39373779296875, "loss": 0.6066, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.1655764877796173, "rewards/margins": 0.19951465725898743, "rewards/rejected": -0.03393816202878952, "step": 310 }, { "epoch": 0.71, "grad_norm": 6.460474591534718, "learning_rate": 1.1945783928745184e-07, "logits/chosen": -1.5879411697387695, "logits/rejected": -1.6782830953598022, "logps/chosen": -148.7205810546875, "logps/rejected": -193.89002990722656, "loss": 0.6004, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.179766446352005, "rewards/margins": 0.21573027968406677, "rewards/rejected": -0.03596385195851326, "step": 320 }, { "epoch": 0.73, "grad_norm": 5.905172245521576, "learning_rate": 1.0336686792445423e-07, "logits/chosen": -1.6096925735473633, "logits/rejected": -1.7555859088897705, "logps/chosen": -141.56198120117188, "logps/rejected": -160.07009887695312, "loss": 0.599, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.1507815718650818, "rewards/margins": 0.19255796074867249, "rewards/rejected": -0.0417763814330101, "step": 330 }, { "epoch": 0.75, "grad_norm": 5.690958514404008, "learning_rate": 8.81534288045431e-08, "logits/chosen": -1.6023919582366943, "logits/rejected": -1.743801474571228, "logps/chosen": -155.75018310546875, "logps/rejected": -177.66357421875, "loss": 0.5925, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.18638750910758972, "rewards/margins": 0.23221752047538757, "rewards/rejected": -0.045830003917217255, "step": 340 }, { "epoch": 0.77, "grad_norm": 5.590062248471453, "learning_rate": 7.390856740405091e-08, "logits/chosen": -1.6454668045043945, "logits/rejected": -1.7857444286346436, "logps/chosen": -164.88294982910156, "logps/rejected": -179.4400177001953, "loss": 0.5915, "rewards/accuracies": 0.8125, "rewards/chosen": 0.16329999268054962, "rewards/margins": 0.2226523905992508, "rewards/rejected": -0.059352416545152664, "step": 350 }, { "epoch": 0.8, "grad_norm": 5.731203070847599, "learning_rate": 6.071753270457065e-08, "logits/chosen": -1.5961381196975708, "logits/rejected": -1.81313157081604, "logps/chosen": -150.7357940673828, "logps/rejected": -198.715087890625, "loss": 0.5961, "rewards/accuracies": 0.8125, "rewards/chosen": 0.13613273203372955, "rewards/margins": 0.21987445652484894, "rewards/rejected": -0.08374173194169998, "step": 360 }, { "epoch": 0.82, "grad_norm": 5.979073484319341, "learning_rate": 4.865926701678352e-08, "logits/chosen": -1.6725813150405884, "logits/rejected": -1.7875378131866455, "logps/chosen": -145.93125915527344, "logps/rejected": -173.05287170410156, "loss": 0.5926, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.1760322004556656, "rewards/margins": 0.2271731197834015, "rewards/rejected": -0.0511409156024456, "step": 370 }, { "epoch": 0.84, "grad_norm": 6.048552200178175, "learning_rate": 3.780593354682826e-08, "logits/chosen": -1.589115858078003, "logits/rejected": -1.8489913940429688, "logps/chosen": -151.71109008789062, "logps/rejected": -172.7513885498047, "loss": 0.6031, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.15712784230709076, "rewards/margins": 0.2085273712873459, "rewards/rejected": -0.05139952152967453, "step": 380 }, { "epoch": 0.86, "grad_norm": 6.206696499253566, "learning_rate": 2.8222484532511166e-08, "logits/chosen": -1.626539945602417, "logits/rejected": -1.8017622232437134, "logps/chosen": -152.2079620361328, "logps/rejected": -174.45358276367188, "loss": 0.5972, "rewards/accuracies": 0.84375, "rewards/chosen": 0.15285542607307434, "rewards/margins": 0.1900016814470291, "rewards/rejected": -0.037146251648664474, "step": 390 }, { "epoch": 0.88, "grad_norm": 6.415272732792148, "learning_rate": 1.9966272533864183e-08, "logits/chosen": -1.6374372243881226, "logits/rejected": -1.8020416498184204, "logps/chosen": -154.29025268554688, "logps/rejected": -180.31027221679688, "loss": 0.5979, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.17826148867607117, "rewards/margins": 0.23326411843299866, "rewards/rejected": -0.055002618581056595, "step": 400 }, { "epoch": 0.88, "eval_logits/chosen": -1.9158859252929688, "eval_logits/rejected": -1.6898193359375, "eval_logps/chosen": -129.27784729003906, "eval_logps/rejected": -152.41265869140625, "eval_loss": 0.6502383947372437, "eval_rewards/accuracies": 0.7272727489471436, "eval_rewards/chosen": 0.15435346961021423, "eval_rewards/margins": 0.10431905090808868, "eval_rewards/rejected": 0.05003440007567406, "eval_runtime": 64.2154, "eval_samples_per_second": 10.496, "eval_steps_per_second": 0.343, "step": 400 }, { "epoch": 0.91, "grad_norm": 5.862851313494248, "learning_rate": 1.3086707204299413e-08, "logits/chosen": -1.5973860025405884, "logits/rejected": -1.8493759632110596, "logps/chosen": -151.65713500976562, "logps/rejected": -169.72344970703125, "loss": 0.5886, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.13965830206871033, "rewards/margins": 0.21700701117515564, "rewards/rejected": -0.07734867930412292, "step": 410 }, { "epoch": 0.93, "grad_norm": 6.0290500426097875, "learning_rate": 7.624959596427145e-09, "logits/chosen": -1.6760833263397217, "logits/rejected": -1.7056286334991455, "logps/chosen": -150.13731384277344, "logps/rejected": -179.03042602539062, "loss": 0.5987, "rewards/accuracies": 0.84375, "rewards/chosen": 0.15684738755226135, "rewards/margins": 0.21405240893363953, "rewards/rejected": -0.057205021381378174, "step": 420 }, { "epoch": 0.95, "grad_norm": 5.566801847010683, "learning_rate": 3.6137157721330967e-09, "logits/chosen": -1.6355880498886108, "logits/rejected": -1.7391598224639893, "logps/chosen": -138.0072479248047, "logps/rejected": -193.86627197265625, "loss": 0.5925, "rewards/accuracies": 0.90625, "rewards/chosen": 0.18388640880584717, "rewards/margins": 0.2870241403579712, "rewards/rejected": -0.10313773155212402, "step": 430 }, { "epoch": 0.97, "grad_norm": 5.3200698426502, "learning_rate": 1.0769811914444204e-09, "logits/chosen": -1.5580041408538818, "logits/rejected": -1.690857172012329, "logps/chosen": -135.1139373779297, "logps/rejected": -164.4469757080078, "loss": 0.588, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.159897118806839, "rewards/margins": 0.23895928263664246, "rewards/rejected": -0.07906216382980347, "step": 440 }, { "epoch": 1.0, "grad_norm": 5.753301995096959, "learning_rate": 2.993705082879328e-11, "logits/chosen": -1.6066157817840576, "logits/rejected": -1.796430230140686, "logps/chosen": -155.62608337402344, "logps/rejected": -184.45956420898438, "loss": 0.5864, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.15883687138557434, "rewards/margins": 0.24033670127391815, "rewards/rejected": -0.0814998596906662, "step": 450 }, { "epoch": 1.0, "step": 452, "total_flos": 0.0, "train_loss": 0.6286900518214809, "train_runtime": 6365.7753, "train_samples_per_second": 4.541, "train_steps_per_second": 0.071 } ], "logging_steps": 10, "max_steps": 452, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }