zephyr-7b-dpo-lora / trainer_state.json
Jerry Ji
Model save
ad61df1
raw
history blame
47.7 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9994837377387713,
"eval_steps": 100,
"global_step": 968,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 5.154639175257731e-09,
"logits/chosen": -2.251229763031006,
"logits/rejected": -2.2295913696289062,
"logps/chosen": -269.52740478515625,
"logps/rejected": -240.59812927246094,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 5.154639175257731e-08,
"logits/chosen": -2.223740339279175,
"logits/rejected": -2.180643081665039,
"logps/chosen": -284.7340087890625,
"logps/rejected": -205.98194885253906,
"loss": 0.694,
"rewards/accuracies": 0.4305555522441864,
"rewards/chosen": -0.0006893649115227163,
"rewards/margins": 0.0007374237175099552,
"rewards/rejected": -0.0014267880469560623,
"step": 10
},
{
"epoch": 0.02,
"learning_rate": 1.0309278350515462e-07,
"logits/chosen": -2.33476185798645,
"logits/rejected": -2.2125375270843506,
"logps/chosen": -320.8204040527344,
"logps/rejected": -248.4267120361328,
"loss": 0.692,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.0003039050498045981,
"rewards/margins": 0.0023796656168997288,
"rewards/rejected": -0.0020757606253027916,
"step": 20
},
{
"epoch": 0.03,
"learning_rate": 1.5463917525773197e-07,
"logits/chosen": -2.339370012283325,
"logits/rejected": -2.304020404815674,
"logps/chosen": -268.95074462890625,
"logps/rejected": -227.067626953125,
"loss": 0.6921,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.0005883350968360901,
"rewards/margins": 0.002594549907371402,
"rewards/rejected": -0.0020062148105353117,
"step": 30
},
{
"epoch": 0.04,
"learning_rate": 2.0618556701030925e-07,
"logits/chosen": -2.3392791748046875,
"logits/rejected": -2.3300938606262207,
"logps/chosen": -308.5113220214844,
"logps/rejected": -253.8385467529297,
"loss": 0.6945,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.0024464379530400038,
"rewards/margins": -0.00025889737298712134,
"rewards/rejected": 0.0027053358498960733,
"step": 40
},
{
"epoch": 0.05,
"learning_rate": 2.5773195876288655e-07,
"logits/chosen": -2.251412868499756,
"logits/rejected": -2.2359275817871094,
"logps/chosen": -297.78375244140625,
"logps/rejected": -227.23556518554688,
"loss": 0.6922,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.0033915191888809204,
"rewards/margins": 0.0055986023508012295,
"rewards/rejected": -0.0022070836275815964,
"step": 50
},
{
"epoch": 0.06,
"learning_rate": 3.0927835051546394e-07,
"logits/chosen": -2.167163848876953,
"logits/rejected": -2.3376193046569824,
"logps/chosen": -256.54510498046875,
"logps/rejected": -229.5459747314453,
"loss": 0.6917,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.000388039683457464,
"rewards/margins": 0.007883811369538307,
"rewards/rejected": -0.0074957734905183315,
"step": 60
},
{
"epoch": 0.07,
"learning_rate": 3.608247422680412e-07,
"logits/chosen": -2.3430614471435547,
"logits/rejected": -2.281782627105713,
"logps/chosen": -313.92608642578125,
"logps/rejected": -252.57284545898438,
"loss": 0.6924,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.0012417413527145982,
"rewards/margins": 0.0001173208438558504,
"rewards/rejected": 0.0011244199704378843,
"step": 70
},
{
"epoch": 0.08,
"learning_rate": 4.123711340206185e-07,
"logits/chosen": -2.337070941925049,
"logits/rejected": -2.3018112182617188,
"logps/chosen": -302.9524841308594,
"logps/rejected": -243.9047088623047,
"loss": 0.6916,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.0021400884725153446,
"rewards/margins": -0.0002812549355439842,
"rewards/rejected": 0.002421343233436346,
"step": 80
},
{
"epoch": 0.09,
"learning_rate": 4.639175257731959e-07,
"logits/chosen": -2.259251356124878,
"logits/rejected": -2.2963995933532715,
"logps/chosen": -270.1668395996094,
"logps/rejected": -216.64822387695312,
"loss": 0.6913,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.009941437281668186,
"rewards/margins": 0.010241752490401268,
"rewards/rejected": -0.00030031436472199857,
"step": 90
},
{
"epoch": 0.1,
"learning_rate": 4.982778415614236e-07,
"logits/chosen": -2.1677582263946533,
"logits/rejected": -2.2741990089416504,
"logps/chosen": -274.75836181640625,
"logps/rejected": -226.3966064453125,
"loss": 0.6901,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.006115993484854698,
"rewards/margins": 0.0013887921813875437,
"rewards/rejected": 0.0047272020019590855,
"step": 100
},
{
"epoch": 0.11,
"learning_rate": 4.925373134328357e-07,
"logits/chosen": -2.271916389465332,
"logits/rejected": -2.197857141494751,
"logps/chosen": -274.72113037109375,
"logps/rejected": -232.5464324951172,
"loss": 0.6886,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.005831545684486628,
"rewards/margins": 0.0067709460854530334,
"rewards/rejected": -0.000939400284551084,
"step": 110
},
{
"epoch": 0.12,
"learning_rate": 4.867967853042479e-07,
"logits/chosen": -2.2548232078552246,
"logits/rejected": -2.322075366973877,
"logps/chosen": -319.34521484375,
"logps/rejected": -235.76535034179688,
"loss": 0.689,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.013832703232765198,
"rewards/margins": 0.01176449190825224,
"rewards/rejected": 0.002068211790174246,
"step": 120
},
{
"epoch": 0.13,
"learning_rate": 4.810562571756601e-07,
"logits/chosen": -2.32174015045166,
"logits/rejected": -2.3775150775909424,
"logps/chosen": -296.20733642578125,
"logps/rejected": -245.56655883789062,
"loss": 0.6875,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.017552796751260757,
"rewards/margins": 0.013545483350753784,
"rewards/rejected": 0.004007314797490835,
"step": 130
},
{
"epoch": 0.14,
"learning_rate": 4.753157290470723e-07,
"logits/chosen": -2.3627283573150635,
"logits/rejected": -2.310948133468628,
"logps/chosen": -301.9321594238281,
"logps/rejected": -239.2898406982422,
"loss": 0.688,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.011156091466546059,
"rewards/margins": 0.009668431244790554,
"rewards/rejected": 0.0014876595232635736,
"step": 140
},
{
"epoch": 0.15,
"learning_rate": 4.6957520091848447e-07,
"logits/chosen": -2.2531113624572754,
"logits/rejected": -2.348215341567993,
"logps/chosen": -284.4292907714844,
"logps/rejected": -259.6882019042969,
"loss": 0.6858,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.017186133190989494,
"rewards/margins": 0.011862866580486298,
"rewards/rejected": 0.005323265679180622,
"step": 150
},
{
"epoch": 0.17,
"learning_rate": 4.6383467278989666e-07,
"logits/chosen": -2.361238956451416,
"logits/rejected": -2.4430744647979736,
"logps/chosen": -286.7644348144531,
"logps/rejected": -221.6837158203125,
"loss": 0.6857,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.01911218836903572,
"rewards/margins": 0.014816234819591045,
"rewards/rejected": 0.00429595448076725,
"step": 160
},
{
"epoch": 0.18,
"learning_rate": 4.580941446613088e-07,
"logits/chosen": -2.32244610786438,
"logits/rejected": -2.3339757919311523,
"logps/chosen": -301.54693603515625,
"logps/rejected": -239.26095581054688,
"loss": 0.6839,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.023171866312623024,
"rewards/margins": 0.014685508795082569,
"rewards/rejected": 0.00848635844886303,
"step": 170
},
{
"epoch": 0.19,
"learning_rate": 4.52353616532721e-07,
"logits/chosen": -2.347285032272339,
"logits/rejected": -2.3244121074676514,
"logps/chosen": -257.841552734375,
"logps/rejected": -214.5565643310547,
"loss": 0.6864,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.019994111731648445,
"rewards/margins": 0.01520625315606594,
"rewards/rejected": 0.004787858575582504,
"step": 180
},
{
"epoch": 0.2,
"learning_rate": 4.4661308840413316e-07,
"logits/chosen": -2.2657313346862793,
"logits/rejected": -2.201254367828369,
"logps/chosen": -253.98916625976562,
"logps/rejected": -206.3340301513672,
"loss": 0.6833,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.0245305635035038,
"rewards/margins": 0.017677443102002144,
"rewards/rejected": 0.006853120867162943,
"step": 190
},
{
"epoch": 0.21,
"learning_rate": 4.408725602755453e-07,
"logits/chosen": -2.284461498260498,
"logits/rejected": -2.2873706817626953,
"logps/chosen": -261.44427490234375,
"logps/rejected": -195.59422302246094,
"loss": 0.6835,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.03187788277864456,
"rewards/margins": 0.024095263332128525,
"rewards/rejected": 0.007782619446516037,
"step": 200
},
{
"epoch": 0.22,
"learning_rate": 4.351320321469575e-07,
"logits/chosen": -2.18426513671875,
"logits/rejected": -2.1963071823120117,
"logps/chosen": -302.31195068359375,
"logps/rejected": -218.6005401611328,
"loss": 0.6815,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.03718667849898338,
"rewards/margins": 0.026892077177762985,
"rewards/rejected": 0.010294605046510696,
"step": 210
},
{
"epoch": 0.23,
"learning_rate": 4.2939150401836967e-07,
"logits/chosen": -2.2150394916534424,
"logits/rejected": -2.2160990238189697,
"logps/chosen": -269.44769287109375,
"logps/rejected": -235.6748504638672,
"loss": 0.6801,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.038056183606386185,
"rewards/margins": 0.023441683501005173,
"rewards/rejected": 0.014614498242735863,
"step": 220
},
{
"epoch": 0.24,
"learning_rate": 4.236509758897818e-07,
"logits/chosen": -2.2152469158172607,
"logits/rejected": -2.1862380504608154,
"logps/chosen": -271.4049377441406,
"logps/rejected": -242.6397247314453,
"loss": 0.6826,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.03268683701753616,
"rewards/margins": 0.026912549510598183,
"rewards/rejected": 0.0057742842473089695,
"step": 230
},
{
"epoch": 0.25,
"learning_rate": 4.17910447761194e-07,
"logits/chosen": -2.3059380054473877,
"logits/rejected": -2.2681984901428223,
"logps/chosen": -309.55499267578125,
"logps/rejected": -221.61703491210938,
"loss": 0.6827,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.03509462997317314,
"rewards/margins": 0.012767216190695763,
"rewards/rejected": 0.02232741378247738,
"step": 240
},
{
"epoch": 0.26,
"learning_rate": 4.121699196326062e-07,
"logits/chosen": -2.307035446166992,
"logits/rejected": -2.2920923233032227,
"logps/chosen": -272.9412841796875,
"logps/rejected": -237.314208984375,
"loss": 0.6824,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.03535359352827072,
"rewards/margins": 0.012216273695230484,
"rewards/rejected": 0.023137323558330536,
"step": 250
},
{
"epoch": 0.27,
"learning_rate": 4.0642939150401836e-07,
"logits/chosen": -2.3456673622131348,
"logits/rejected": -2.3194832801818848,
"logps/chosen": -270.475341796875,
"logps/rejected": -221.84536743164062,
"loss": 0.6805,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.04569912329316139,
"rewards/margins": 0.029975151643157005,
"rewards/rejected": 0.015723969787359238,
"step": 260
},
{
"epoch": 0.28,
"learning_rate": 4.006888633754305e-07,
"logits/chosen": -2.385854721069336,
"logits/rejected": -2.3556528091430664,
"logps/chosen": -284.36029052734375,
"logps/rejected": -232.5426788330078,
"loss": 0.6793,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.04998317360877991,
"rewards/margins": 0.032010577619075775,
"rewards/rejected": 0.017972594127058983,
"step": 270
},
{
"epoch": 0.29,
"learning_rate": 3.949483352468427e-07,
"logits/chosen": -2.308225154876709,
"logits/rejected": -2.259629726409912,
"logps/chosen": -293.1715087890625,
"logps/rejected": -236.4293975830078,
"loss": 0.6771,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.057786036282777786,
"rewards/margins": 0.04149205610156059,
"rewards/rejected": 0.016293983906507492,
"step": 280
},
{
"epoch": 0.3,
"learning_rate": 3.8920780711825487e-07,
"logits/chosen": -2.278501033782959,
"logits/rejected": -2.369293689727783,
"logps/chosen": -278.4786376953125,
"logps/rejected": -227.40927124023438,
"loss": 0.6792,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0485750176012516,
"rewards/margins": 0.02242155373096466,
"rewards/rejected": 0.02615346387028694,
"step": 290
},
{
"epoch": 0.31,
"learning_rate": 3.83467278989667e-07,
"logits/chosen": -2.2661235332489014,
"logits/rejected": -2.205644130706787,
"logps/chosen": -254.183837890625,
"logps/rejected": -221.9667510986328,
"loss": 0.6772,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.05874975398182869,
"rewards/margins": 0.03965791314840317,
"rewards/rejected": 0.019091838970780373,
"step": 300
},
{
"epoch": 0.32,
"learning_rate": 3.777267508610792e-07,
"logits/chosen": -2.32353138923645,
"logits/rejected": -2.3743112087249756,
"logps/chosen": -306.22711181640625,
"logps/rejected": -257.60980224609375,
"loss": 0.6783,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.04823786020278931,
"rewards/margins": 0.017192820087075233,
"rewards/rejected": 0.03104504384100437,
"step": 310
},
{
"epoch": 0.33,
"learning_rate": 3.7198622273249137e-07,
"logits/chosen": -2.234679698944092,
"logits/rejected": -2.211430788040161,
"logps/chosen": -251.83053588867188,
"logps/rejected": -193.01544189453125,
"loss": 0.6739,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.06583289802074432,
"rewards/margins": 0.047706056386232376,
"rewards/rejected": 0.018126841634511948,
"step": 320
},
{
"epoch": 0.34,
"learning_rate": 3.662456946039035e-07,
"logits/chosen": -2.259127140045166,
"logits/rejected": -2.287956714630127,
"logps/chosen": -312.1918029785156,
"logps/rejected": -239.03530883789062,
"loss": 0.6761,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.07068151980638504,
"rewards/margins": 0.051512353122234344,
"rewards/rejected": 0.0191691592335701,
"step": 330
},
{
"epoch": 0.35,
"learning_rate": 3.605051664753157e-07,
"logits/chosen": -2.197277784347534,
"logits/rejected": -2.13037109375,
"logps/chosen": -244.2609100341797,
"logps/rejected": -238.80953979492188,
"loss": 0.6788,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.05732797831296921,
"rewards/margins": 0.030042264610528946,
"rewards/rejected": 0.027285713702440262,
"step": 340
},
{
"epoch": 0.36,
"learning_rate": 3.547646383467279e-07,
"logits/chosen": -2.365830421447754,
"logits/rejected": -2.3728528022766113,
"logps/chosen": -313.7022705078125,
"logps/rejected": -248.090087890625,
"loss": 0.6746,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.08016298711299896,
"rewards/margins": 0.05509548634290695,
"rewards/rejected": 0.025067497044801712,
"step": 350
},
{
"epoch": 0.37,
"learning_rate": 3.4902411021814007e-07,
"logits/chosen": -2.22756290435791,
"logits/rejected": -2.259359121322632,
"logps/chosen": -303.25250244140625,
"logps/rejected": -249.8985595703125,
"loss": 0.6723,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.06414168328046799,
"rewards/margins": 0.04363773763179779,
"rewards/rejected": 0.020503941923379898,
"step": 360
},
{
"epoch": 0.38,
"learning_rate": 3.432835820895522e-07,
"logits/chosen": -2.3700273036956787,
"logits/rejected": -2.3231639862060547,
"logps/chosen": -314.5257263183594,
"logps/rejected": -270.7105712890625,
"loss": 0.6759,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.07061124593019485,
"rewards/margins": 0.03391130641102791,
"rewards/rejected": 0.03669993579387665,
"step": 370
},
{
"epoch": 0.39,
"learning_rate": 3.375430539609644e-07,
"logits/chosen": -2.3212878704071045,
"logits/rejected": -2.249602794647217,
"logps/chosen": -291.92474365234375,
"logps/rejected": -239.6724395751953,
"loss": 0.677,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.07373902946710587,
"rewards/margins": 0.03367278352379799,
"rewards/rejected": 0.04006624594330788,
"step": 380
},
{
"epoch": 0.4,
"learning_rate": 3.3180252583237657e-07,
"logits/chosen": -2.297023057937622,
"logits/rejected": -2.264172077178955,
"logps/chosen": -278.0927734375,
"logps/rejected": -237.13436889648438,
"loss": 0.6722,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.0686158686876297,
"rewards/margins": 0.051144860684871674,
"rewards/rejected": 0.01747100241482258,
"step": 390
},
{
"epoch": 0.41,
"learning_rate": 3.260619977037887e-07,
"logits/chosen": -2.237035036087036,
"logits/rejected": -2.2392399311065674,
"logps/chosen": -263.4399108886719,
"logps/rejected": -213.87451171875,
"loss": 0.6707,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.06768475472927094,
"rewards/margins": 0.048441771417856216,
"rewards/rejected": 0.019242987036705017,
"step": 400
},
{
"epoch": 0.42,
"learning_rate": 3.203214695752009e-07,
"logits/chosen": -2.2776081562042236,
"logits/rejected": -2.2924447059631348,
"logps/chosen": -268.8953857421875,
"logps/rejected": -252.852294921875,
"loss": 0.6673,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.08111406862735748,
"rewards/margins": 0.05318716913461685,
"rewards/rejected": 0.027926897630095482,
"step": 410
},
{
"epoch": 0.43,
"learning_rate": 3.145809414466131e-07,
"logits/chosen": -2.3054046630859375,
"logits/rejected": -2.2502362728118896,
"logps/chosen": -252.5205841064453,
"logps/rejected": -204.43344116210938,
"loss": 0.6749,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.07272285223007202,
"rewards/margins": 0.04809904843568802,
"rewards/rejected": 0.024623800069093704,
"step": 420
},
{
"epoch": 0.44,
"learning_rate": 3.0884041331802526e-07,
"logits/chosen": -2.3482632637023926,
"logits/rejected": -2.3258707523345947,
"logps/chosen": -263.67095947265625,
"logps/rejected": -241.14047241210938,
"loss": 0.6741,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.07022975385189056,
"rewards/margins": 0.04051927849650383,
"rewards/rejected": 0.029710477218031883,
"step": 430
},
{
"epoch": 0.45,
"learning_rate": 3.030998851894374e-07,
"logits/chosen": -2.286533832550049,
"logits/rejected": -2.320568084716797,
"logps/chosen": -286.72894287109375,
"logps/rejected": -247.65542602539062,
"loss": 0.6705,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.07666246592998505,
"rewards/margins": 0.05972421169281006,
"rewards/rejected": 0.01693824864923954,
"step": 440
},
{
"epoch": 0.46,
"learning_rate": 2.973593570608496e-07,
"logits/chosen": -2.206477642059326,
"logits/rejected": -2.315464496612549,
"logps/chosen": -276.1682434082031,
"logps/rejected": -230.3959197998047,
"loss": 0.678,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.0718303695321083,
"rewards/margins": 0.04074189439415932,
"rewards/rejected": 0.03108847141265869,
"step": 450
},
{
"epoch": 0.47,
"learning_rate": 2.9161882893226177e-07,
"logits/chosen": -2.277815103530884,
"logits/rejected": -2.342268705368042,
"logps/chosen": -273.23773193359375,
"logps/rejected": -222.5966796875,
"loss": 0.6662,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.0802597850561142,
"rewards/margins": 0.050464123487472534,
"rewards/rejected": 0.029795657843351364,
"step": 460
},
{
"epoch": 0.49,
"learning_rate": 2.858783008036739e-07,
"logits/chosen": -2.2656216621398926,
"logits/rejected": -2.2778594493865967,
"logps/chosen": -248.9929656982422,
"logps/rejected": -215.5894012451172,
"loss": 0.6669,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.08564073592424393,
"rewards/margins": 0.06490761041641235,
"rewards/rejected": 0.020733121782541275,
"step": 470
},
{
"epoch": 0.5,
"learning_rate": 2.801377726750861e-07,
"logits/chosen": -2.2962255477905273,
"logits/rejected": -2.27239727973938,
"logps/chosen": -289.5277404785156,
"logps/rejected": -231.601318359375,
"loss": 0.6713,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.08144901692867279,
"rewards/margins": 0.05658548325300217,
"rewards/rejected": 0.024863524362444878,
"step": 480
},
{
"epoch": 0.51,
"learning_rate": 2.743972445464983e-07,
"logits/chosen": -2.445746660232544,
"logits/rejected": -2.267007827758789,
"logps/chosen": -293.1885986328125,
"logps/rejected": -243.8875274658203,
"loss": 0.6676,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.10828351974487305,
"rewards/margins": 0.08175922185182571,
"rewards/rejected": 0.02652430161833763,
"step": 490
},
{
"epoch": 0.52,
"learning_rate": 2.686567164179104e-07,
"logits/chosen": -2.278276205062866,
"logits/rejected": -2.295633316040039,
"logps/chosen": -254.94760131835938,
"logps/rejected": -221.79452514648438,
"loss": 0.6672,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.08227074891328812,
"rewards/margins": 0.055896710604429245,
"rewards/rejected": 0.026374032720923424,
"step": 500
},
{
"epoch": 0.53,
"learning_rate": 2.629161882893226e-07,
"logits/chosen": -2.202611207962036,
"logits/rejected": -2.2495861053466797,
"logps/chosen": -310.4443664550781,
"logps/rejected": -256.72406005859375,
"loss": 0.6666,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.07021793723106384,
"rewards/margins": 0.040728576481342316,
"rewards/rejected": 0.02948935702443123,
"step": 510
},
{
"epoch": 0.54,
"learning_rate": 2.571756601607348e-07,
"logits/chosen": -2.3376307487487793,
"logits/rejected": -2.352074146270752,
"logps/chosen": -278.10504150390625,
"logps/rejected": -244.0722198486328,
"loss": 0.6697,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0925985723733902,
"rewards/margins": 0.0637633204460144,
"rewards/rejected": 0.028835251927375793,
"step": 520
},
{
"epoch": 0.55,
"learning_rate": 2.5143513203214697e-07,
"logits/chosen": -2.243332624435425,
"logits/rejected": -2.2513413429260254,
"logps/chosen": -242.59439086914062,
"logps/rejected": -224.13259887695312,
"loss": 0.6716,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.07866770029067993,
"rewards/margins": 0.057711243629455566,
"rewards/rejected": 0.020956454798579216,
"step": 530
},
{
"epoch": 0.56,
"learning_rate": 2.456946039035591e-07,
"logits/chosen": -2.300567150115967,
"logits/rejected": -2.271827220916748,
"logps/chosen": -288.2174377441406,
"logps/rejected": -240.34439086914062,
"loss": 0.6682,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.10411250591278076,
"rewards/margins": 0.05851038545370102,
"rewards/rejected": 0.04560210928320885,
"step": 540
},
{
"epoch": 0.57,
"learning_rate": 2.399540757749713e-07,
"logits/chosen": -2.3359756469726562,
"logits/rejected": -2.194058895111084,
"logps/chosen": -265.052001953125,
"logps/rejected": -230.23605346679688,
"loss": 0.6686,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0775262787938118,
"rewards/margins": 0.05575944110751152,
"rewards/rejected": 0.021766824647784233,
"step": 550
},
{
"epoch": 0.58,
"learning_rate": 2.3421354764638345e-07,
"logits/chosen": -2.3195242881774902,
"logits/rejected": -2.283975124359131,
"logps/chosen": -302.0104064941406,
"logps/rejected": -252.0124053955078,
"loss": 0.6708,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.10010389983654022,
"rewards/margins": 0.053703296929597855,
"rewards/rejected": 0.04640059918165207,
"step": 560
},
{
"epoch": 0.59,
"learning_rate": 2.2847301951779563e-07,
"logits/chosen": -2.2481091022491455,
"logits/rejected": -2.400871515274048,
"logps/chosen": -268.6519775390625,
"logps/rejected": -223.69882202148438,
"loss": 0.6654,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0826568529009819,
"rewards/margins": 0.05431235954165459,
"rewards/rejected": 0.028344491496682167,
"step": 570
},
{
"epoch": 0.6,
"learning_rate": 2.227324913892078e-07,
"logits/chosen": -2.299408197402954,
"logits/rejected": -2.22338604927063,
"logps/chosen": -299.3912353515625,
"logps/rejected": -236.9815216064453,
"loss": 0.661,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.10458721220493317,
"rewards/margins": 0.08465038239955902,
"rewards/rejected": 0.019936833530664444,
"step": 580
},
{
"epoch": 0.61,
"learning_rate": 2.1699196326061998e-07,
"logits/chosen": -2.2584633827209473,
"logits/rejected": -2.2311649322509766,
"logps/chosen": -253.76913452148438,
"logps/rejected": -218.6166534423828,
"loss": 0.6687,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.07234074175357819,
"rewards/margins": 0.04758009687066078,
"rewards/rejected": 0.024760644882917404,
"step": 590
},
{
"epoch": 0.62,
"learning_rate": 2.1125143513203214e-07,
"logits/chosen": -2.318943738937378,
"logits/rejected": -2.2511682510375977,
"logps/chosen": -256.5652770996094,
"logps/rejected": -206.35586547851562,
"loss": 0.669,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.07542125880718231,
"rewards/margins": 0.0553053617477417,
"rewards/rejected": 0.020115893334150314,
"step": 600
},
{
"epoch": 0.63,
"learning_rate": 2.055109070034443e-07,
"logits/chosen": -2.3058714866638184,
"logits/rejected": -2.304198741912842,
"logps/chosen": -266.4674987792969,
"logps/rejected": -223.82711791992188,
"loss": 0.6677,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.09824246913194656,
"rewards/margins": 0.06738617271184921,
"rewards/rejected": 0.03085630014538765,
"step": 610
},
{
"epoch": 0.64,
"learning_rate": 1.997703788748565e-07,
"logits/chosen": -2.337787389755249,
"logits/rejected": -2.2819180488586426,
"logps/chosen": -313.7826232910156,
"logps/rejected": -249.5704803466797,
"loss": 0.6582,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.10966908931732178,
"rewards/margins": 0.08016980439424515,
"rewards/rejected": 0.029499292373657227,
"step": 620
},
{
"epoch": 0.65,
"learning_rate": 1.9402985074626865e-07,
"logits/chosen": -2.2067112922668457,
"logits/rejected": -2.246953010559082,
"logps/chosen": -259.2144775390625,
"logps/rejected": -240.3810272216797,
"loss": 0.6653,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.09941162168979645,
"rewards/margins": 0.06417630612850189,
"rewards/rejected": 0.035235337913036346,
"step": 630
},
{
"epoch": 0.66,
"learning_rate": 1.8828932261768083e-07,
"logits/chosen": -2.2894420623779297,
"logits/rejected": -2.2385382652282715,
"logps/chosen": -266.48992919921875,
"logps/rejected": -217.8952178955078,
"loss": 0.661,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.095299132168293,
"rewards/margins": 0.07987986505031586,
"rewards/rejected": 0.01541926246136427,
"step": 640
},
{
"epoch": 0.67,
"learning_rate": 1.82548794489093e-07,
"logits/chosen": -2.33485746383667,
"logits/rejected": -2.3108019828796387,
"logps/chosen": -284.7020568847656,
"logps/rejected": -232.82080078125,
"loss": 0.664,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.10341651737689972,
"rewards/margins": 0.07464977353811264,
"rewards/rejected": 0.028766745701432228,
"step": 650
},
{
"epoch": 0.68,
"learning_rate": 1.7680826636050515e-07,
"logits/chosen": -2.3347816467285156,
"logits/rejected": -2.2758853435516357,
"logps/chosen": -279.80059814453125,
"logps/rejected": -233.2425994873047,
"loss": 0.6608,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.11068934202194214,
"rewards/margins": 0.07695071399211884,
"rewards/rejected": 0.0337386280298233,
"step": 660
},
{
"epoch": 0.69,
"learning_rate": 1.7106773823191734e-07,
"logits/chosen": -2.2854952812194824,
"logits/rejected": -2.273536205291748,
"logps/chosen": -295.6964416503906,
"logps/rejected": -240.4071502685547,
"loss": 0.6615,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.1013779416680336,
"rewards/margins": 0.060683172196149826,
"rewards/rejected": 0.04069476202130318,
"step": 670
},
{
"epoch": 0.7,
"learning_rate": 1.653272101033295e-07,
"logits/chosen": -2.34243106842041,
"logits/rejected": -2.2720611095428467,
"logps/chosen": -289.71722412109375,
"logps/rejected": -230.321533203125,
"loss": 0.6729,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.09767869859933853,
"rewards/margins": 0.039280109107494354,
"rewards/rejected": 0.05839858204126358,
"step": 680
},
{
"epoch": 0.71,
"learning_rate": 1.5958668197474169e-07,
"logits/chosen": -2.371598482131958,
"logits/rejected": -2.362656354904175,
"logps/chosen": -268.17828369140625,
"logps/rejected": -229.41232299804688,
"loss": 0.6659,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0969640463590622,
"rewards/margins": 0.06369610875844955,
"rewards/rejected": 0.033267926424741745,
"step": 690
},
{
"epoch": 0.72,
"learning_rate": 1.5384615384615385e-07,
"logits/chosen": -2.2588796615600586,
"logits/rejected": -2.2576823234558105,
"logps/chosen": -282.4342041015625,
"logps/rejected": -222.56381225585938,
"loss": 0.664,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.10399500280618668,
"rewards/margins": 0.08138440549373627,
"rewards/rejected": 0.0226106159389019,
"step": 700
},
{
"epoch": 0.73,
"learning_rate": 1.4810562571756603e-07,
"logits/chosen": -2.3341283798217773,
"logits/rejected": -2.2046780586242676,
"logps/chosen": -272.2647399902344,
"logps/rejected": -208.01364135742188,
"loss": 0.666,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.10669133812189102,
"rewards/margins": 0.08235933631658554,
"rewards/rejected": 0.02433200553059578,
"step": 710
},
{
"epoch": 0.74,
"learning_rate": 1.423650975889782e-07,
"logits/chosen": -2.323979139328003,
"logits/rejected": -2.340238094329834,
"logps/chosen": -303.2074279785156,
"logps/rejected": -259.44268798828125,
"loss": 0.6667,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.11533965170383453,
"rewards/margins": 0.047552816569805145,
"rewards/rejected": 0.06778682768344879,
"step": 720
},
{
"epoch": 0.75,
"learning_rate": 1.3662456946039035e-07,
"logits/chosen": -2.3031513690948486,
"logits/rejected": -2.28584623336792,
"logps/chosen": -270.1670837402344,
"logps/rejected": -252.5519256591797,
"loss": 0.6642,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.10461707413196564,
"rewards/margins": 0.058367032557725906,
"rewards/rejected": 0.04625004902482033,
"step": 730
},
{
"epoch": 0.76,
"learning_rate": 1.3088404133180254e-07,
"logits/chosen": -2.2157022953033447,
"logits/rejected": -2.2670745849609375,
"logps/chosen": -276.71240234375,
"logps/rejected": -199.2496795654297,
"loss": 0.6635,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.11176248639822006,
"rewards/margins": 0.08353973925113678,
"rewards/rejected": 0.02822275087237358,
"step": 740
},
{
"epoch": 0.77,
"learning_rate": 1.251435132032147e-07,
"logits/chosen": -2.2043914794921875,
"logits/rejected": -2.221619129180908,
"logps/chosen": -269.0702819824219,
"logps/rejected": -220.8921356201172,
"loss": 0.665,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.09922349452972412,
"rewards/margins": 0.04318443313241005,
"rewards/rejected": 0.05603905767202377,
"step": 750
},
{
"epoch": 0.78,
"learning_rate": 1.1940298507462686e-07,
"logits/chosen": -2.232959270477295,
"logits/rejected": -2.2529525756835938,
"logps/chosen": -267.9338684082031,
"logps/rejected": -249.4876251220703,
"loss": 0.6684,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.08004304021596909,
"rewards/margins": 0.04949140548706055,
"rewards/rejected": 0.030551627278327942,
"step": 760
},
{
"epoch": 0.8,
"learning_rate": 1.1366245694603903e-07,
"logits/chosen": -2.293257236480713,
"logits/rejected": -2.2078585624694824,
"logps/chosen": -273.19671630859375,
"logps/rejected": -238.57858276367188,
"loss": 0.661,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.11353409290313721,
"rewards/margins": 0.06645722687244415,
"rewards/rejected": 0.04707685858011246,
"step": 770
},
{
"epoch": 0.81,
"learning_rate": 1.079219288174512e-07,
"logits/chosen": -2.3507869243621826,
"logits/rejected": -2.325718879699707,
"logps/chosen": -290.9693298339844,
"logps/rejected": -236.1486358642578,
"loss": 0.6633,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.0980958342552185,
"rewards/margins": 0.07181811332702637,
"rewards/rejected": 0.026277724653482437,
"step": 780
},
{
"epoch": 0.82,
"learning_rate": 1.0218140068886336e-07,
"logits/chosen": -2.268038272857666,
"logits/rejected": -2.286581516265869,
"logps/chosen": -270.3387451171875,
"logps/rejected": -221.06356811523438,
"loss": 0.6564,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.12088136374950409,
"rewards/margins": 0.080001600086689,
"rewards/rejected": 0.040879763662815094,
"step": 790
},
{
"epoch": 0.83,
"learning_rate": 9.644087256027554e-08,
"logits/chosen": -2.272735118865967,
"logits/rejected": -2.2941083908081055,
"logps/chosen": -284.6488952636719,
"logps/rejected": -243.56796264648438,
"loss": 0.6639,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.1113913282752037,
"rewards/margins": 0.05327050760388374,
"rewards/rejected": 0.05812082439661026,
"step": 800
},
{
"epoch": 0.84,
"learning_rate": 9.070034443168771e-08,
"logits/chosen": -2.2838375568389893,
"logits/rejected": -2.289247751235962,
"logps/chosen": -269.5845642089844,
"logps/rejected": -230.6207275390625,
"loss": 0.6617,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.09149408340454102,
"rewards/margins": 0.06341233849525452,
"rewards/rejected": 0.02808173932135105,
"step": 810
},
{
"epoch": 0.85,
"learning_rate": 8.495981630309988e-08,
"logits/chosen": -2.365980863571167,
"logits/rejected": -2.3436598777770996,
"logps/chosen": -302.0718688964844,
"logps/rejected": -228.1407470703125,
"loss": 0.6623,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.13062262535095215,
"rewards/margins": 0.08858474344015121,
"rewards/rejected": 0.04203786700963974,
"step": 820
},
{
"epoch": 0.86,
"learning_rate": 7.921928817451206e-08,
"logits/chosen": -2.342413902282715,
"logits/rejected": -2.2254080772399902,
"logps/chosen": -287.4922180175781,
"logps/rejected": -222.5606231689453,
"loss": 0.6565,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.12904855608940125,
"rewards/margins": 0.08615640550851822,
"rewards/rejected": 0.04289213940501213,
"step": 830
},
{
"epoch": 0.87,
"learning_rate": 7.347876004592423e-08,
"logits/chosen": -2.259397029876709,
"logits/rejected": -2.227036476135254,
"logps/chosen": -258.3423767089844,
"logps/rejected": -216.99606323242188,
"loss": 0.6714,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.10358164459466934,
"rewards/margins": 0.06773830950260162,
"rewards/rejected": 0.03584333881735802,
"step": 840
},
{
"epoch": 0.88,
"learning_rate": 6.773823191733639e-08,
"logits/chosen": -2.2834537029266357,
"logits/rejected": -2.3872971534729004,
"logps/chosen": -262.05084228515625,
"logps/rejected": -231.11306762695312,
"loss": 0.6647,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.09495140612125397,
"rewards/margins": 0.055265575647354126,
"rewards/rejected": 0.03968583419919014,
"step": 850
},
{
"epoch": 0.89,
"learning_rate": 6.199770378874856e-08,
"logits/chosen": -2.4065003395080566,
"logits/rejected": -2.3337345123291016,
"logps/chosen": -295.71478271484375,
"logps/rejected": -270.1822814941406,
"loss": 0.6693,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.11348612606525421,
"rewards/margins": 0.07466179132461548,
"rewards/rejected": 0.03882431983947754,
"step": 860
},
{
"epoch": 0.9,
"learning_rate": 5.6257175660160735e-08,
"logits/chosen": -2.2463555335998535,
"logits/rejected": -2.2443947792053223,
"logps/chosen": -312.9588317871094,
"logps/rejected": -237.4109344482422,
"loss": 0.6644,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.10128283500671387,
"rewards/margins": 0.053178369998931885,
"rewards/rejected": 0.04810447618365288,
"step": 870
},
{
"epoch": 0.91,
"learning_rate": 5.05166475315729e-08,
"logits/chosen": -2.358501434326172,
"logits/rejected": -2.313483715057373,
"logps/chosen": -291.43377685546875,
"logps/rejected": -240.09054565429688,
"loss": 0.6632,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.10742716491222382,
"rewards/margins": 0.07204015552997589,
"rewards/rejected": 0.03538701683282852,
"step": 880
},
{
"epoch": 0.92,
"learning_rate": 4.477611940298507e-08,
"logits/chosen": -2.313149929046631,
"logits/rejected": -2.3558261394500732,
"logps/chosen": -285.90643310546875,
"logps/rejected": -235.43051147460938,
"loss": 0.6666,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.12259715795516968,
"rewards/margins": 0.09698096662759781,
"rewards/rejected": 0.02561618760228157,
"step": 890
},
{
"epoch": 0.93,
"learning_rate": 3.903559127439724e-08,
"logits/chosen": -2.3278651237487793,
"logits/rejected": -2.195068836212158,
"logps/chosen": -272.7381896972656,
"logps/rejected": -211.40640258789062,
"loss": 0.658,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.1207551583647728,
"rewards/margins": 0.09316142648458481,
"rewards/rejected": 0.027593741193413734,
"step": 900
},
{
"epoch": 0.94,
"learning_rate": 3.3295063145809414e-08,
"logits/chosen": -2.290696859359741,
"logits/rejected": -2.3440823554992676,
"logps/chosen": -238.2651824951172,
"logps/rejected": -206.77969360351562,
"loss": 0.6616,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.09928463399410248,
"rewards/margins": 0.07226204872131348,
"rewards/rejected": 0.027022594586014748,
"step": 910
},
{
"epoch": 0.95,
"learning_rate": 2.755453501722158e-08,
"logits/chosen": -2.375807762145996,
"logits/rejected": -2.367743730545044,
"logps/chosen": -281.56195068359375,
"logps/rejected": -225.125244140625,
"loss": 0.662,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.1072310209274292,
"rewards/margins": 0.056608647108078,
"rewards/rejected": 0.050622373819351196,
"step": 920
},
{
"epoch": 0.96,
"learning_rate": 2.1814006888633754e-08,
"logits/chosen": -2.281919002532959,
"logits/rejected": -2.254122734069824,
"logps/chosen": -256.39105224609375,
"logps/rejected": -203.3081817626953,
"loss": 0.6617,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.11211923509836197,
"rewards/margins": 0.07925260812044144,
"rewards/rejected": 0.03286661207675934,
"step": 930
},
{
"epoch": 0.97,
"learning_rate": 1.6073478760045924e-08,
"logits/chosen": -2.316282272338867,
"logits/rejected": -2.3123340606689453,
"logps/chosen": -271.6207580566406,
"logps/rejected": -231.7317352294922,
"loss": 0.6626,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.10637687146663666,
"rewards/margins": 0.06768520176410675,
"rewards/rejected": 0.0386916846036911,
"step": 940
},
{
"epoch": 0.98,
"learning_rate": 1.0332950631458094e-08,
"logits/chosen": -2.3146958351135254,
"logits/rejected": -2.2793381214141846,
"logps/chosen": -282.83270263671875,
"logps/rejected": -233.0804443359375,
"loss": 0.6612,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.11455857753753662,
"rewards/margins": 0.0838586837053299,
"rewards/rejected": 0.030699897557497025,
"step": 950
},
{
"epoch": 0.99,
"learning_rate": 4.592422502870264e-09,
"logits/chosen": -2.251638889312744,
"logits/rejected": -2.234907627105713,
"logps/chosen": -281.0075378417969,
"logps/rejected": -239.98049926757812,
"loss": 0.661,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.1062885969877243,
"rewards/margins": 0.06708581745624542,
"rewards/rejected": 0.03920278698205948,
"step": 960
},
{
"epoch": 1.0,
"eval_logits/chosen": -2.4597132205963135,
"eval_logits/rejected": -2.398695468902588,
"eval_logps/chosen": -278.69171142578125,
"eval_logps/rejected": -230.4560089111328,
"eval_loss": 0.6642152070999146,
"eval_rewards/accuracies": 0.6480000019073486,
"eval_rewards/chosen": 0.10415761172771454,
"eval_rewards/margins": 0.06405296921730042,
"eval_rewards/rejected": 0.04010463133454323,
"eval_runtime": 443.9432,
"eval_samples_per_second": 4.505,
"eval_steps_per_second": 0.282,
"step": 968
},
{
"epoch": 1.0,
"step": 968,
"total_flos": 0.0,
"train_loss": 0.6728762634529555,
"train_runtime": 27528.1814,
"train_samples_per_second": 2.251,
"train_steps_per_second": 0.035
}
],
"logging_steps": 10,
"max_steps": 968,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 0.0,
"trial_name": null,
"trial_params": null
}