Zhihu_Qwen2/output/checkpoint-3/trainer_state.json
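The JSON below is the Trainer state saved with this checkpoint: top-level run metadata (epoch, global_step, eval_steps) plus a log_history array whose entries record epoch, grad_norm, learning_rate, loss, and step at each logging interval. As a minimal sketch of how such a file could be inspected (the relative path and the use of matplotlib are assumptions for illustration, not part of this upload):

# Hypothetical sketch: load a trainer_state.json like the one below and plot the
# logged training loss. The file path and plotting choices are assumptions.
import json

import matplotlib.pyplot as plt

with open("checkpoint-3/trainer_state.json", encoding="utf-8") as f:
    state = json.load(f)

# Each log_history entry carries epoch, grad_norm, learning_rate, loss, and step.
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(f"global_step={state['global_step']}, epoch={state['epoch']:.2f}")
plt.show()
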
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9149207505920933,
"eval_steps": 500,
"global_step": 8000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0036436509382401167,
"grad_norm": 0.6875,
"learning_rate": 9.987852283770651e-05,
"loss": 3.4902,
"step": 10
},
{
"epoch": 0.007287301876480233,
"grad_norm": 0.66796875,
"learning_rate": 9.975704567541302e-05,
"loss": 3.3432,
"step": 20
},
{
"epoch": 0.01093095281472035,
"grad_norm": 0.5546875,
"learning_rate": 9.963556851311953e-05,
"loss": 3.2381,
"step": 30
},
{
"epoch": 0.014574603752960467,
"grad_norm": 0.65234375,
"learning_rate": 9.951409135082604e-05,
"loss": 3.2931,
"step": 40
},
{
"epoch": 0.018218254691200583,
"grad_norm": 0.6328125,
"learning_rate": 9.939261418853257e-05,
"loss": 3.3235,
"step": 50
},
{
"epoch": 0.0218619056294407,
"grad_norm": 0.64453125,
"learning_rate": 9.927113702623908e-05,
"loss": 3.2988,
"step": 60
},
{
"epoch": 0.025505556567680818,
"grad_norm": 0.59765625,
"learning_rate": 9.914965986394558e-05,
"loss": 3.2927,
"step": 70
},
{
"epoch": 0.029149207505920934,
"grad_norm": 0.57421875,
"learning_rate": 9.90281827016521e-05,
"loss": 3.275,
"step": 80
},
{
"epoch": 0.03279285844416105,
"grad_norm": 0.640625,
"learning_rate": 9.89067055393586e-05,
"loss": 3.316,
"step": 90
},
{
"epoch": 0.036436509382401165,
"grad_norm": 0.57421875,
"learning_rate": 9.878522837706513e-05,
"loss": 3.2611,
"step": 100
},
{
"epoch": 0.04008016032064128,
"grad_norm": 0.51171875,
"learning_rate": 9.866375121477162e-05,
"loss": 3.268,
"step": 110
},
{
"epoch": 0.0437238112588814,
"grad_norm": 0.703125,
"learning_rate": 9.854227405247813e-05,
"loss": 3.3032,
"step": 120
},
{
"epoch": 0.04736746219712151,
"grad_norm": 0.5546875,
"learning_rate": 9.842079689018465e-05,
"loss": 3.3334,
"step": 130
},
{
"epoch": 0.051011113135361635,
"grad_norm": 0.671875,
"learning_rate": 9.829931972789116e-05,
"loss": 3.1943,
"step": 140
},
{
"epoch": 0.05465476407360175,
"grad_norm": 0.6171875,
"learning_rate": 9.817784256559767e-05,
"loss": 3.2574,
"step": 150
},
{
"epoch": 0.05829841501184187,
"grad_norm": 0.66015625,
"learning_rate": 9.805636540330418e-05,
"loss": 3.3747,
"step": 160
},
{
"epoch": 0.06194206595008198,
"grad_norm": 0.52734375,
"learning_rate": 9.793488824101069e-05,
"loss": 3.2992,
"step": 170
},
{
"epoch": 0.0655857168883221,
"grad_norm": 0.50390625,
"learning_rate": 9.781341107871722e-05,
"loss": 3.2342,
"step": 180
},
{
"epoch": 0.06922936782656222,
"grad_norm": 0.65234375,
"learning_rate": 9.769193391642371e-05,
"loss": 3.356,
"step": 190
},
{
"epoch": 0.07287301876480233,
"grad_norm": 0.57421875,
"learning_rate": 9.757045675413022e-05,
"loss": 3.3618,
"step": 200
},
{
"epoch": 0.07651666970304245,
"grad_norm": 0.58984375,
"learning_rate": 9.744897959183674e-05,
"loss": 3.2931,
"step": 210
},
{
"epoch": 0.08016032064128256,
"grad_norm": 0.77734375,
"learning_rate": 9.732750242954325e-05,
"loss": 3.3246,
"step": 220
},
{
"epoch": 0.08380397157952268,
"grad_norm": 0.5859375,
"learning_rate": 9.720602526724975e-05,
"loss": 3.3181,
"step": 230
},
{
"epoch": 0.0874476225177628,
"grad_norm": 0.640625,
"learning_rate": 9.708454810495627e-05,
"loss": 3.2757,
"step": 240
},
{
"epoch": 0.09109127345600292,
"grad_norm": 0.55859375,
"learning_rate": 9.696307094266278e-05,
"loss": 3.2753,
"step": 250
},
{
"epoch": 0.09473492439424303,
"grad_norm": 0.58203125,
"learning_rate": 9.68415937803693e-05,
"loss": 3.3207,
"step": 260
},
{
"epoch": 0.09837857533248315,
"grad_norm": 0.63671875,
"learning_rate": 9.67201166180758e-05,
"loss": 3.3035,
"step": 270
},
{
"epoch": 0.10202222627072327,
"grad_norm": 0.578125,
"learning_rate": 9.659863945578231e-05,
"loss": 3.3025,
"step": 280
},
{
"epoch": 0.10566587720896338,
"grad_norm": 0.5859375,
"learning_rate": 9.647716229348883e-05,
"loss": 3.2066,
"step": 290
},
{
"epoch": 0.1093095281472035,
"grad_norm": 0.7109375,
"learning_rate": 9.635568513119534e-05,
"loss": 3.2757,
"step": 300
},
{
"epoch": 0.11295317908544361,
"grad_norm": 0.609375,
"learning_rate": 9.623420796890185e-05,
"loss": 3.1904,
"step": 310
},
{
"epoch": 0.11659683002368373,
"grad_norm": 0.60546875,
"learning_rate": 9.611273080660836e-05,
"loss": 3.1947,
"step": 320
},
{
"epoch": 0.12024048096192384,
"grad_norm": 0.6171875,
"learning_rate": 9.599125364431487e-05,
"loss": 3.2016,
"step": 330
},
{
"epoch": 0.12388413190016397,
"grad_norm": 0.640625,
"learning_rate": 9.58697764820214e-05,
"loss": 3.329,
"step": 340
},
{
"epoch": 0.12752778283840407,
"grad_norm": 0.66796875,
"learning_rate": 9.574829931972789e-05,
"loss": 3.2483,
"step": 350
},
{
"epoch": 0.1311714337766442,
"grad_norm": 0.57421875,
"learning_rate": 9.56268221574344e-05,
"loss": 3.2388,
"step": 360
},
{
"epoch": 0.13481508471488432,
"grad_norm": 0.58984375,
"learning_rate": 9.550534499514092e-05,
"loss": 3.2722,
"step": 370
},
{
"epoch": 0.13845873565312444,
"grad_norm": 0.58203125,
"learning_rate": 9.538386783284743e-05,
"loss": 3.2672,
"step": 380
},
{
"epoch": 0.14210238659136454,
"grad_norm": 0.5234375,
"learning_rate": 9.526239067055394e-05,
"loss": 3.3378,
"step": 390
},
{
"epoch": 0.14574603752960466,
"grad_norm": 0.55859375,
"learning_rate": 9.514091350826045e-05,
"loss": 3.2637,
"step": 400
},
{
"epoch": 0.14938968846784478,
"grad_norm": 0.70703125,
"learning_rate": 9.501943634596696e-05,
"loss": 3.2879,
"step": 410
},
{
"epoch": 0.1530333394060849,
"grad_norm": 0.6640625,
"learning_rate": 9.489795918367348e-05,
"loss": 3.2614,
"step": 420
},
{
"epoch": 0.156676990344325,
"grad_norm": 0.625,
"learning_rate": 9.477648202137999e-05,
"loss": 3.2469,
"step": 430
},
{
"epoch": 0.16032064128256512,
"grad_norm": 0.5703125,
"learning_rate": 9.465500485908649e-05,
"loss": 3.1614,
"step": 440
},
{
"epoch": 0.16396429222080525,
"grad_norm": 0.59765625,
"learning_rate": 9.453352769679301e-05,
"loss": 3.2658,
"step": 450
},
{
"epoch": 0.16760794315904537,
"grad_norm": 0.6953125,
"learning_rate": 9.441205053449952e-05,
"loss": 3.3253,
"step": 460
},
{
"epoch": 0.1712515940972855,
"grad_norm": 0.67578125,
"learning_rate": 9.429057337220603e-05,
"loss": 3.2311,
"step": 470
},
{
"epoch": 0.1748952450355256,
"grad_norm": 0.625,
"learning_rate": 9.416909620991254e-05,
"loss": 3.3117,
"step": 480
},
{
"epoch": 0.1785388959737657,
"grad_norm": 0.6640625,
"learning_rate": 9.404761904761905e-05,
"loss": 3.3513,
"step": 490
},
{
"epoch": 0.18218254691200583,
"grad_norm": 0.5703125,
"learning_rate": 9.392614188532556e-05,
"loss": 3.3071,
"step": 500
},
{
"epoch": 0.18582619785024596,
"grad_norm": 0.5703125,
"learning_rate": 9.380466472303208e-05,
"loss": 3.3047,
"step": 510
},
{
"epoch": 0.18946984878848605,
"grad_norm": 0.58984375,
"learning_rate": 9.368318756073858e-05,
"loss": 3.1964,
"step": 520
},
{
"epoch": 0.19311349972672617,
"grad_norm": 0.57421875,
"learning_rate": 9.35617103984451e-05,
"loss": 3.2459,
"step": 530
},
{
"epoch": 0.1967571506649663,
"grad_norm": 0.62109375,
"learning_rate": 9.344023323615161e-05,
"loss": 3.205,
"step": 540
},
{
"epoch": 0.20040080160320642,
"grad_norm": 0.66015625,
"learning_rate": 9.331875607385812e-05,
"loss": 3.2856,
"step": 550
},
{
"epoch": 0.20404445254144654,
"grad_norm": 0.52734375,
"learning_rate": 9.319727891156463e-05,
"loss": 3.185,
"step": 560
},
{
"epoch": 0.20768810347968664,
"grad_norm": 0.5546875,
"learning_rate": 9.307580174927114e-05,
"loss": 3.3071,
"step": 570
},
{
"epoch": 0.21133175441792676,
"grad_norm": 0.63671875,
"learning_rate": 9.295432458697765e-05,
"loss": 3.2363,
"step": 580
},
{
"epoch": 0.21497540535616688,
"grad_norm": 0.5625,
"learning_rate": 9.283284742468417e-05,
"loss": 3.2697,
"step": 590
},
{
"epoch": 0.218619056294407,
"grad_norm": 0.56640625,
"learning_rate": 9.271137026239067e-05,
"loss": 3.3037,
"step": 600
},
{
"epoch": 0.2222627072326471,
"grad_norm": 0.53125,
"learning_rate": 9.258989310009719e-05,
"loss": 3.2371,
"step": 610
},
{
"epoch": 0.22590635817088722,
"grad_norm": 0.61328125,
"learning_rate": 9.24684159378037e-05,
"loss": 3.3367,
"step": 620
},
{
"epoch": 0.22955000910912735,
"grad_norm": 0.5703125,
"learning_rate": 9.234693877551021e-05,
"loss": 3.2109,
"step": 630
},
{
"epoch": 0.23319366004736747,
"grad_norm": 0.59375,
"learning_rate": 9.222546161321672e-05,
"loss": 3.2374,
"step": 640
},
{
"epoch": 0.2368373109856076,
"grad_norm": 0.6875,
"learning_rate": 9.210398445092323e-05,
"loss": 3.3066,
"step": 650
},
{
"epoch": 0.24048096192384769,
"grad_norm": 0.6484375,
"learning_rate": 9.198250728862974e-05,
"loss": 3.2635,
"step": 660
},
{
"epoch": 0.2441246128620878,
"grad_norm": 0.60546875,
"learning_rate": 9.186103012633626e-05,
"loss": 3.26,
"step": 670
},
{
"epoch": 0.24776826380032793,
"grad_norm": 0.65234375,
"learning_rate": 9.173955296404276e-05,
"loss": 3.2641,
"step": 680
},
{
"epoch": 0.25141191473856805,
"grad_norm": 0.6015625,
"learning_rate": 9.161807580174927e-05,
"loss": 3.2907,
"step": 690
},
{
"epoch": 0.25505556567680815,
"grad_norm": 0.54296875,
"learning_rate": 9.149659863945579e-05,
"loss": 3.2567,
"step": 700
},
{
"epoch": 0.2586992166150483,
"grad_norm": 0.62890625,
"learning_rate": 9.13751214771623e-05,
"loss": 3.2838,
"step": 710
},
{
"epoch": 0.2623428675532884,
"grad_norm": 0.546875,
"learning_rate": 9.125364431486881e-05,
"loss": 3.2969,
"step": 720
},
{
"epoch": 0.2659865184915285,
"grad_norm": 0.6328125,
"learning_rate": 9.113216715257532e-05,
"loss": 3.2212,
"step": 730
},
{
"epoch": 0.26963016942976864,
"grad_norm": 0.6328125,
"learning_rate": 9.101068999028183e-05,
"loss": 3.212,
"step": 740
},
{
"epoch": 0.27327382036800874,
"grad_norm": 0.5859375,
"learning_rate": 9.088921282798835e-05,
"loss": 3.3488,
"step": 750
},
{
"epoch": 0.2769174713062489,
"grad_norm": 0.546875,
"learning_rate": 9.076773566569486e-05,
"loss": 3.2143,
"step": 760
},
{
"epoch": 0.280561122244489,
"grad_norm": 0.56640625,
"learning_rate": 9.064625850340136e-05,
"loss": 3.2518,
"step": 770
},
{
"epoch": 0.2842047731827291,
"grad_norm": 0.578125,
"learning_rate": 9.052478134110788e-05,
"loss": 3.2638,
"step": 780
},
{
"epoch": 0.2878484241209692,
"grad_norm": 0.58203125,
"learning_rate": 9.040330417881439e-05,
"loss": 3.2584,
"step": 790
},
{
"epoch": 0.2914920750592093,
"grad_norm": 0.62890625,
"learning_rate": 9.02818270165209e-05,
"loss": 3.2841,
"step": 800
},
{
"epoch": 0.29513572599744947,
"grad_norm": 0.55078125,
"learning_rate": 9.01603498542274e-05,
"loss": 3.261,
"step": 810
},
{
"epoch": 0.29877937693568957,
"grad_norm": 0.6171875,
"learning_rate": 9.003887269193392e-05,
"loss": 3.2954,
"step": 820
},
{
"epoch": 0.30242302787392966,
"grad_norm": 0.54296875,
"learning_rate": 8.991739552964044e-05,
"loss": 3.2337,
"step": 830
},
{
"epoch": 0.3060666788121698,
"grad_norm": 0.6171875,
"learning_rate": 8.979591836734695e-05,
"loss": 3.2881,
"step": 840
},
{
"epoch": 0.3097103297504099,
"grad_norm": 0.5546875,
"learning_rate": 8.967444120505344e-05,
"loss": 3.3519,
"step": 850
},
{
"epoch": 0.31335398068865,
"grad_norm": 0.5859375,
"learning_rate": 8.955296404275997e-05,
"loss": 3.3147,
"step": 860
},
{
"epoch": 0.31699763162689015,
"grad_norm": 0.62890625,
"learning_rate": 8.943148688046648e-05,
"loss": 3.2304,
"step": 870
},
{
"epoch": 0.32064128256513025,
"grad_norm": 0.60546875,
"learning_rate": 8.931000971817299e-05,
"loss": 3.2526,
"step": 880
},
{
"epoch": 0.3242849335033704,
"grad_norm": 0.6640625,
"learning_rate": 8.91885325558795e-05,
"loss": 3.309,
"step": 890
},
{
"epoch": 0.3279285844416105,
"grad_norm": 0.6484375,
"learning_rate": 8.9067055393586e-05,
"loss": 3.2513,
"step": 900
},
{
"epoch": 0.3315722353798506,
"grad_norm": 0.5703125,
"learning_rate": 8.894557823129253e-05,
"loss": 3.2135,
"step": 910
},
{
"epoch": 0.33521588631809074,
"grad_norm": 0.64453125,
"learning_rate": 8.882410106899904e-05,
"loss": 3.3048,
"step": 920
},
{
"epoch": 0.33885953725633083,
"grad_norm": 0.6015625,
"learning_rate": 8.870262390670553e-05,
"loss": 3.3047,
"step": 930
},
{
"epoch": 0.342503188194571,
"grad_norm": 0.6015625,
"learning_rate": 8.858114674441206e-05,
"loss": 3.2616,
"step": 940
},
{
"epoch": 0.3461468391328111,
"grad_norm": 0.5859375,
"learning_rate": 8.845966958211857e-05,
"loss": 3.2697,
"step": 950
},
{
"epoch": 0.3497904900710512,
"grad_norm": 0.72265625,
"learning_rate": 8.833819241982508e-05,
"loss": 3.2395,
"step": 960
},
{
"epoch": 0.3534341410092913,
"grad_norm": 0.61328125,
"learning_rate": 8.821671525753159e-05,
"loss": 3.2137,
"step": 970
},
{
"epoch": 0.3570777919475314,
"grad_norm": 0.625,
"learning_rate": 8.80952380952381e-05,
"loss": 3.2872,
"step": 980
},
{
"epoch": 0.36072144288577157,
"grad_norm": 0.5859375,
"learning_rate": 8.797376093294462e-05,
"loss": 3.2682,
"step": 990
},
{
"epoch": 0.36436509382401167,
"grad_norm": 0.5390625,
"learning_rate": 8.785228377065113e-05,
"loss": 3.204,
"step": 1000
},
{
"epoch": 0.36800874476225176,
"grad_norm": 0.71875,
"learning_rate": 8.773080660835762e-05,
"loss": 3.2472,
"step": 1010
},
{
"epoch": 0.3716523957004919,
"grad_norm": 0.609375,
"learning_rate": 8.760932944606415e-05,
"loss": 3.2638,
"step": 1020
},
{
"epoch": 0.375296046638732,
"grad_norm": 0.60546875,
"learning_rate": 8.748785228377066e-05,
"loss": 3.2803,
"step": 1030
},
{
"epoch": 0.3789396975769721,
"grad_norm": 0.66796875,
"learning_rate": 8.736637512147716e-05,
"loss": 3.273,
"step": 1040
},
{
"epoch": 0.38258334851521225,
"grad_norm": 0.65625,
"learning_rate": 8.724489795918367e-05,
"loss": 3.2854,
"step": 1050
},
{
"epoch": 0.38622699945345235,
"grad_norm": 0.640625,
"learning_rate": 8.712342079689018e-05,
"loss": 3.2373,
"step": 1060
},
{
"epoch": 0.3898706503916925,
"grad_norm": 0.55859375,
"learning_rate": 8.700194363459671e-05,
"loss": 3.2259,
"step": 1070
},
{
"epoch": 0.3935143013299326,
"grad_norm": 0.5078125,
"learning_rate": 8.688046647230322e-05,
"loss": 3.2402,
"step": 1080
},
{
"epoch": 0.3971579522681727,
"grad_norm": 0.61328125,
"learning_rate": 8.675898931000973e-05,
"loss": 3.2379,
"step": 1090
},
{
"epoch": 0.40080160320641284,
"grad_norm": 0.59375,
"learning_rate": 8.663751214771624e-05,
"loss": 3.2564,
"step": 1100
},
{
"epoch": 0.40444525414465293,
"grad_norm": 0.69921875,
"learning_rate": 8.651603498542274e-05,
"loss": 3.2342,
"step": 1110
},
{
"epoch": 0.4080889050828931,
"grad_norm": 0.53125,
"learning_rate": 8.639455782312925e-05,
"loss": 3.3336,
"step": 1120
},
{
"epoch": 0.4117325560211332,
"grad_norm": 0.63671875,
"learning_rate": 8.627308066083576e-05,
"loss": 3.2684,
"step": 1130
},
{
"epoch": 0.4153762069593733,
"grad_norm": 0.61328125,
"learning_rate": 8.615160349854227e-05,
"loss": 3.2581,
"step": 1140
},
{
"epoch": 0.4190198578976134,
"grad_norm": 0.50390625,
"learning_rate": 8.603012633624878e-05,
"loss": 3.3428,
"step": 1150
},
{
"epoch": 0.4226635088358535,
"grad_norm": 0.58203125,
"learning_rate": 8.59086491739553e-05,
"loss": 3.2331,
"step": 1160
},
{
"epoch": 0.42630715977409367,
"grad_norm": 0.63671875,
"learning_rate": 8.578717201166182e-05,
"loss": 3.2203,
"step": 1170
},
{
"epoch": 0.42995081071233376,
"grad_norm": 0.57421875,
"learning_rate": 8.566569484936832e-05,
"loss": 3.248,
"step": 1180
},
{
"epoch": 0.43359446165057386,
"grad_norm": 0.6015625,
"learning_rate": 8.554421768707483e-05,
"loss": 3.3052,
"step": 1190
},
{
"epoch": 0.437238112588814,
"grad_norm": 0.5546875,
"learning_rate": 8.542274052478134e-05,
"loss": 3.2036,
"step": 1200
},
{
"epoch": 0.4408817635270541,
"grad_norm": 0.64453125,
"learning_rate": 8.530126336248787e-05,
"loss": 3.2199,
"step": 1210
},
{
"epoch": 0.4445254144652942,
"grad_norm": 0.68359375,
"learning_rate": 8.517978620019436e-05,
"loss": 3.2594,
"step": 1220
},
{
"epoch": 0.44816906540353435,
"grad_norm": 0.6953125,
"learning_rate": 8.505830903790087e-05,
"loss": 3.26,
"step": 1230
},
{
"epoch": 0.45181271634177445,
"grad_norm": 0.66015625,
"learning_rate": 8.49368318756074e-05,
"loss": 3.3623,
"step": 1240
},
{
"epoch": 0.4554563672800146,
"grad_norm": 0.7421875,
"learning_rate": 8.48153547133139e-05,
"loss": 3.2625,
"step": 1250
},
{
"epoch": 0.4591000182182547,
"grad_norm": 0.6875,
"learning_rate": 8.469387755102041e-05,
"loss": 3.2738,
"step": 1260
},
{
"epoch": 0.4627436691564948,
"grad_norm": 0.61328125,
"learning_rate": 8.457240038872692e-05,
"loss": 3.2688,
"step": 1270
},
{
"epoch": 0.46638732009473494,
"grad_norm": 0.609375,
"learning_rate": 8.445092322643343e-05,
"loss": 3.2392,
"step": 1280
},
{
"epoch": 0.47003097103297503,
"grad_norm": 0.56640625,
"learning_rate": 8.432944606413996e-05,
"loss": 3.2414,
"step": 1290
},
{
"epoch": 0.4736746219712152,
"grad_norm": 0.640625,
"learning_rate": 8.420796890184645e-05,
"loss": 3.2461,
"step": 1300
},
{
"epoch": 0.4773182729094553,
"grad_norm": 0.578125,
"learning_rate": 8.408649173955296e-05,
"loss": 3.3459,
"step": 1310
},
{
"epoch": 0.48096192384769537,
"grad_norm": 0.6953125,
"learning_rate": 8.396501457725948e-05,
"loss": 3.2631,
"step": 1320
},
{
"epoch": 0.4846055747859355,
"grad_norm": 0.59765625,
"learning_rate": 8.3843537414966e-05,
"loss": 3.2883,
"step": 1330
},
{
"epoch": 0.4882492257241756,
"grad_norm": 0.625,
"learning_rate": 8.372206025267249e-05,
"loss": 3.2085,
"step": 1340
},
{
"epoch": 0.49189287666241577,
"grad_norm": 0.6640625,
"learning_rate": 8.360058309037901e-05,
"loss": 3.3132,
"step": 1350
},
{
"epoch": 0.49553652760065586,
"grad_norm": 0.61328125,
"learning_rate": 8.347910592808552e-05,
"loss": 3.3076,
"step": 1360
},
{
"epoch": 0.49918017853889596,
"grad_norm": 0.7265625,
"learning_rate": 8.335762876579204e-05,
"loss": 3.3183,
"step": 1370
},
{
"epoch": 0.5028238294771361,
"grad_norm": 0.55859375,
"learning_rate": 8.323615160349854e-05,
"loss": 3.1761,
"step": 1380
},
{
"epoch": 0.5064674804153763,
"grad_norm": 0.60546875,
"learning_rate": 8.311467444120505e-05,
"loss": 3.2079,
"step": 1390
},
{
"epoch": 0.5101111313536163,
"grad_norm": 0.703125,
"learning_rate": 8.299319727891157e-05,
"loss": 3.2844,
"step": 1400
},
{
"epoch": 0.5137547822918564,
"grad_norm": 0.578125,
"learning_rate": 8.287172011661808e-05,
"loss": 3.2492,
"step": 1410
},
{
"epoch": 0.5173984332300966,
"grad_norm": 0.6328125,
"learning_rate": 8.275024295432459e-05,
"loss": 3.2525,
"step": 1420
},
{
"epoch": 0.5210420841683366,
"grad_norm": 0.5703125,
"learning_rate": 8.26287657920311e-05,
"loss": 3.2449,
"step": 1430
},
{
"epoch": 0.5246857351065768,
"grad_norm": 0.54296875,
"learning_rate": 8.250728862973761e-05,
"loss": 3.2279,
"step": 1440
},
{
"epoch": 0.5283293860448169,
"grad_norm": 0.5859375,
"learning_rate": 8.238581146744413e-05,
"loss": 3.2751,
"step": 1450
},
{
"epoch": 0.531973036983057,
"grad_norm": 0.57421875,
"learning_rate": 8.226433430515063e-05,
"loss": 3.2404,
"step": 1460
},
{
"epoch": 0.5356166879212971,
"grad_norm": 0.67578125,
"learning_rate": 8.214285714285714e-05,
"loss": 3.2911,
"step": 1470
},
{
"epoch": 0.5392603388595373,
"grad_norm": 0.6796875,
"learning_rate": 8.202137998056366e-05,
"loss": 3.2637,
"step": 1480
},
{
"epoch": 0.5429039897977773,
"grad_norm": 0.61328125,
"learning_rate": 8.189990281827017e-05,
"loss": 3.2004,
"step": 1490
},
{
"epoch": 0.5465476407360175,
"grad_norm": 0.6875,
"learning_rate": 8.177842565597668e-05,
"loss": 3.2958,
"step": 1500
},
{
"epoch": 0.5501912916742576,
"grad_norm": 0.609375,
"learning_rate": 8.165694849368319e-05,
"loss": 3.2371,
"step": 1510
},
{
"epoch": 0.5538349426124978,
"grad_norm": 0.6171875,
"learning_rate": 8.15354713313897e-05,
"loss": 3.2798,
"step": 1520
},
{
"epoch": 0.5574785935507378,
"grad_norm": 0.6953125,
"learning_rate": 8.141399416909622e-05,
"loss": 3.2608,
"step": 1530
},
{
"epoch": 0.561122244488978,
"grad_norm": 0.62109375,
"learning_rate": 8.129251700680273e-05,
"loss": 3.2374,
"step": 1540
},
{
"epoch": 0.5647658954272181,
"grad_norm": 0.625,
"learning_rate": 8.117103984450923e-05,
"loss": 3.189,
"step": 1550
},
{
"epoch": 0.5684095463654582,
"grad_norm": 0.57421875,
"learning_rate": 8.104956268221575e-05,
"loss": 3.2008,
"step": 1560
},
{
"epoch": 0.5720531973036983,
"grad_norm": 0.58984375,
"learning_rate": 8.092808551992226e-05,
"loss": 3.219,
"step": 1570
},
{
"epoch": 0.5756968482419385,
"grad_norm": 0.58203125,
"learning_rate": 8.080660835762877e-05,
"loss": 3.2417,
"step": 1580
},
{
"epoch": 0.5793404991801785,
"grad_norm": 0.63671875,
"learning_rate": 8.068513119533528e-05,
"loss": 3.236,
"step": 1590
},
{
"epoch": 0.5829841501184186,
"grad_norm": 0.703125,
"learning_rate": 8.056365403304179e-05,
"loss": 3.3037,
"step": 1600
},
{
"epoch": 0.5866278010566588,
"grad_norm": 0.703125,
"learning_rate": 8.04421768707483e-05,
"loss": 3.2412,
"step": 1610
},
{
"epoch": 0.5902714519948989,
"grad_norm": 0.66796875,
"learning_rate": 8.032069970845482e-05,
"loss": 3.2293,
"step": 1620
},
{
"epoch": 0.593915102933139,
"grad_norm": 0.6640625,
"learning_rate": 8.019922254616132e-05,
"loss": 3.2208,
"step": 1630
},
{
"epoch": 0.5975587538713791,
"grad_norm": 0.671875,
"learning_rate": 8.007774538386784e-05,
"loss": 3.2251,
"step": 1640
},
{
"epoch": 0.6012024048096193,
"grad_norm": 0.63671875,
"learning_rate": 7.995626822157435e-05,
"loss": 3.284,
"step": 1650
},
{
"epoch": 0.6048460557478593,
"grad_norm": 0.6484375,
"learning_rate": 7.983479105928086e-05,
"loss": 3.2404,
"step": 1660
},
{
"epoch": 0.6084897066860995,
"grad_norm": 0.69140625,
"learning_rate": 7.971331389698737e-05,
"loss": 3.3335,
"step": 1670
},
{
"epoch": 0.6121333576243396,
"grad_norm": 0.59765625,
"learning_rate": 7.959183673469388e-05,
"loss": 3.276,
"step": 1680
},
{
"epoch": 0.6157770085625797,
"grad_norm": 0.63671875,
"learning_rate": 7.947035957240039e-05,
"loss": 3.2263,
"step": 1690
},
{
"epoch": 0.6194206595008198,
"grad_norm": 0.546875,
"learning_rate": 7.934888241010691e-05,
"loss": 3.1878,
"step": 1700
},
{
"epoch": 0.62306431043906,
"grad_norm": 0.625,
"learning_rate": 7.922740524781341e-05,
"loss": 3.294,
"step": 1710
},
{
"epoch": 0.6267079613773,
"grad_norm": 0.578125,
"learning_rate": 7.910592808551993e-05,
"loss": 3.2183,
"step": 1720
},
{
"epoch": 0.6303516123155402,
"grad_norm": 0.69140625,
"learning_rate": 7.898445092322644e-05,
"loss": 3.1985,
"step": 1730
},
{
"epoch": 0.6339952632537803,
"grad_norm": 0.74609375,
"learning_rate": 7.886297376093295e-05,
"loss": 3.1563,
"step": 1740
},
{
"epoch": 0.6376389141920205,
"grad_norm": 0.6484375,
"learning_rate": 7.874149659863946e-05,
"loss": 3.2806,
"step": 1750
},
{
"epoch": 0.6412825651302605,
"grad_norm": 0.6328125,
"learning_rate": 7.862001943634597e-05,
"loss": 3.2288,
"step": 1760
},
{
"epoch": 0.6449262160685006,
"grad_norm": 0.5859375,
"learning_rate": 7.849854227405248e-05,
"loss": 3.2785,
"step": 1770
},
{
"epoch": 0.6485698670067408,
"grad_norm": 0.6875,
"learning_rate": 7.8377065111759e-05,
"loss": 3.2952,
"step": 1780
},
{
"epoch": 0.6522135179449808,
"grad_norm": 0.6796875,
"learning_rate": 7.82555879494655e-05,
"loss": 3.1665,
"step": 1790
},
{
"epoch": 0.655857168883221,
"grad_norm": 0.6796875,
"learning_rate": 7.8134110787172e-05,
"loss": 3.1984,
"step": 1800
},
{
"epoch": 0.6595008198214611,
"grad_norm": 0.625,
"learning_rate": 7.801263362487853e-05,
"loss": 3.2051,
"step": 1810
},
{
"epoch": 0.6631444707597012,
"grad_norm": 0.6640625,
"learning_rate": 7.789115646258504e-05,
"loss": 3.2141,
"step": 1820
},
{
"epoch": 0.6667881216979413,
"grad_norm": 0.59375,
"learning_rate": 7.776967930029155e-05,
"loss": 3.312,
"step": 1830
},
{
"epoch": 0.6704317726361815,
"grad_norm": 0.65234375,
"learning_rate": 7.764820213799806e-05,
"loss": 3.2473,
"step": 1840
},
{
"epoch": 0.6740754235744215,
"grad_norm": 0.61328125,
"learning_rate": 7.752672497570457e-05,
"loss": 3.2924,
"step": 1850
},
{
"epoch": 0.6777190745126617,
"grad_norm": 0.71484375,
"learning_rate": 7.740524781341109e-05,
"loss": 3.2799,
"step": 1860
},
{
"epoch": 0.6813627254509018,
"grad_norm": 0.55078125,
"learning_rate": 7.72837706511176e-05,
"loss": 3.2251,
"step": 1870
},
{
"epoch": 0.685006376389142,
"grad_norm": 0.70703125,
"learning_rate": 7.71622934888241e-05,
"loss": 3.209,
"step": 1880
},
{
"epoch": 0.688650027327382,
"grad_norm": 0.63671875,
"learning_rate": 7.704081632653062e-05,
"loss": 3.2312,
"step": 1890
},
{
"epoch": 0.6922936782656222,
"grad_norm": 0.6328125,
"learning_rate": 7.691933916423713e-05,
"loss": 3.2487,
"step": 1900
},
{
"epoch": 0.6959373292038623,
"grad_norm": 0.5703125,
"learning_rate": 7.679786200194364e-05,
"loss": 3.3157,
"step": 1910
},
{
"epoch": 0.6995809801421023,
"grad_norm": 0.63671875,
"learning_rate": 7.667638483965015e-05,
"loss": 3.299,
"step": 1920
},
{
"epoch": 0.7032246310803425,
"grad_norm": 0.69140625,
"learning_rate": 7.655490767735666e-05,
"loss": 3.2755,
"step": 1930
},
{
"epoch": 0.7068682820185826,
"grad_norm": 0.625,
"learning_rate": 7.643343051506318e-05,
"loss": 3.317,
"step": 1940
},
{
"epoch": 0.7105119329568227,
"grad_norm": 0.55078125,
"learning_rate": 7.631195335276969e-05,
"loss": 3.1871,
"step": 1950
},
{
"epoch": 0.7141555838950628,
"grad_norm": 0.74609375,
"learning_rate": 7.619047619047618e-05,
"loss": 3.2405,
"step": 1960
},
{
"epoch": 0.717799234833303,
"grad_norm": 0.69921875,
"learning_rate": 7.606899902818271e-05,
"loss": 3.3068,
"step": 1970
},
{
"epoch": 0.7214428857715431,
"grad_norm": 0.578125,
"learning_rate": 7.594752186588922e-05,
"loss": 3.335,
"step": 1980
},
{
"epoch": 0.7250865367097832,
"grad_norm": 0.6484375,
"learning_rate": 7.582604470359573e-05,
"loss": 3.2617,
"step": 1990
},
{
"epoch": 0.7287301876480233,
"grad_norm": 0.5234375,
"learning_rate": 7.570456754130224e-05,
"loss": 3.2335,
"step": 2000
},
{
"epoch": 0.7323738385862635,
"grad_norm": 0.640625,
"learning_rate": 7.558309037900875e-05,
"loss": 3.2604,
"step": 2010
},
{
"epoch": 0.7360174895245035,
"grad_norm": 0.57421875,
"learning_rate": 7.546161321671527e-05,
"loss": 3.2632,
"step": 2020
},
{
"epoch": 0.7396611404627437,
"grad_norm": 0.61328125,
"learning_rate": 7.534013605442178e-05,
"loss": 3.2184,
"step": 2030
},
{
"epoch": 0.7433047914009838,
"grad_norm": 0.6171875,
"learning_rate": 7.521865889212827e-05,
"loss": 3.2848,
"step": 2040
},
{
"epoch": 0.7469484423392239,
"grad_norm": 0.6484375,
"learning_rate": 7.50971817298348e-05,
"loss": 3.2473,
"step": 2050
},
{
"epoch": 0.750592093277464,
"grad_norm": 0.6953125,
"learning_rate": 7.49757045675413e-05,
"loss": 3.195,
"step": 2060
},
{
"epoch": 0.7542357442157042,
"grad_norm": 0.73046875,
"learning_rate": 7.485422740524782e-05,
"loss": 3.2248,
"step": 2070
},
{
"epoch": 0.7578793951539442,
"grad_norm": 0.5390625,
"learning_rate": 7.473275024295433e-05,
"loss": 3.1511,
"step": 2080
},
{
"epoch": 0.7615230460921844,
"grad_norm": 0.66796875,
"learning_rate": 7.461127308066083e-05,
"loss": 3.2719,
"step": 2090
},
{
"epoch": 0.7651666970304245,
"grad_norm": 0.57421875,
"learning_rate": 7.448979591836736e-05,
"loss": 3.2339,
"step": 2100
},
{
"epoch": 0.7688103479686647,
"grad_norm": 0.61328125,
"learning_rate": 7.436831875607387e-05,
"loss": 3.2863,
"step": 2110
},
{
"epoch": 0.7724539989069047,
"grad_norm": 0.55859375,
"learning_rate": 7.424684159378036e-05,
"loss": 3.2057,
"step": 2120
},
{
"epoch": 0.7760976498451448,
"grad_norm": 0.73046875,
"learning_rate": 7.412536443148689e-05,
"loss": 3.2397,
"step": 2130
},
{
"epoch": 0.779741300783385,
"grad_norm": 0.59375,
"learning_rate": 7.40038872691934e-05,
"loss": 3.2323,
"step": 2140
},
{
"epoch": 0.783384951721625,
"grad_norm": 0.63671875,
"learning_rate": 7.38824101068999e-05,
"loss": 3.2764,
"step": 2150
},
{
"epoch": 0.7870286026598652,
"grad_norm": 0.60546875,
"learning_rate": 7.376093294460641e-05,
"loss": 3.2668,
"step": 2160
},
{
"epoch": 0.7906722535981053,
"grad_norm": 0.63671875,
"learning_rate": 7.363945578231292e-05,
"loss": 3.2953,
"step": 2170
},
{
"epoch": 0.7943159045363454,
"grad_norm": 0.5625,
"learning_rate": 7.351797862001945e-05,
"loss": 3.1915,
"step": 2180
},
{
"epoch": 0.7979595554745855,
"grad_norm": 0.66015625,
"learning_rate": 7.339650145772596e-05,
"loss": 3.2622,
"step": 2190
},
{
"epoch": 0.8016032064128257,
"grad_norm": 0.6171875,
"learning_rate": 7.327502429543247e-05,
"loss": 3.2522,
"step": 2200
},
{
"epoch": 0.8052468573510657,
"grad_norm": 0.64453125,
"learning_rate": 7.315354713313898e-05,
"loss": 3.1673,
"step": 2210
},
{
"epoch": 0.8088905082893059,
"grad_norm": 0.625,
"learning_rate": 7.303206997084548e-05,
"loss": 3.2722,
"step": 2220
},
{
"epoch": 0.812534159227546,
"grad_norm": 0.6640625,
"learning_rate": 7.2910592808552e-05,
"loss": 3.2377,
"step": 2230
},
{
"epoch": 0.8161778101657862,
"grad_norm": 0.6171875,
"learning_rate": 7.27891156462585e-05,
"loss": 3.179,
"step": 2240
},
{
"epoch": 0.8198214611040262,
"grad_norm": 0.57421875,
"learning_rate": 7.266763848396501e-05,
"loss": 3.2588,
"step": 2250
},
{
"epoch": 0.8234651120422664,
"grad_norm": 0.578125,
"learning_rate": 7.254616132167152e-05,
"loss": 3.2664,
"step": 2260
},
{
"epoch": 0.8271087629805065,
"grad_norm": 0.73046875,
"learning_rate": 7.242468415937805e-05,
"loss": 3.2515,
"step": 2270
},
{
"epoch": 0.8307524139187465,
"grad_norm": 0.6328125,
"learning_rate": 7.230320699708455e-05,
"loss": 3.2102,
"step": 2280
},
{
"epoch": 0.8343960648569867,
"grad_norm": 0.6484375,
"learning_rate": 7.218172983479106e-05,
"loss": 3.246,
"step": 2290
},
{
"epoch": 0.8380397157952268,
"grad_norm": 0.58203125,
"learning_rate": 7.206025267249757e-05,
"loss": 3.3321,
"step": 2300
},
{
"epoch": 0.8416833667334669,
"grad_norm": 0.59765625,
"learning_rate": 7.193877551020408e-05,
"loss": 3.0889,
"step": 2310
},
{
"epoch": 0.845327017671707,
"grad_norm": 0.66015625,
"learning_rate": 7.18172983479106e-05,
"loss": 3.2811,
"step": 2320
},
{
"epoch": 0.8489706686099472,
"grad_norm": 0.65234375,
"learning_rate": 7.16958211856171e-05,
"loss": 3.1688,
"step": 2330
},
{
"epoch": 0.8526143195481873,
"grad_norm": 0.76171875,
"learning_rate": 7.157434402332361e-05,
"loss": 3.2495,
"step": 2340
},
{
"epoch": 0.8562579704864274,
"grad_norm": 0.6484375,
"learning_rate": 7.145286686103013e-05,
"loss": 3.1742,
"step": 2350
},
{
"epoch": 0.8599016214246675,
"grad_norm": 0.5859375,
"learning_rate": 7.133138969873664e-05,
"loss": 3.2293,
"step": 2360
},
{
"epoch": 0.8635452723629077,
"grad_norm": 0.640625,
"learning_rate": 7.120991253644315e-05,
"loss": 3.2574,
"step": 2370
},
{
"epoch": 0.8671889233011477,
"grad_norm": 0.55078125,
"learning_rate": 7.108843537414966e-05,
"loss": 3.2496,
"step": 2380
},
{
"epoch": 0.8708325742393879,
"grad_norm": 0.7109375,
"learning_rate": 7.096695821185617e-05,
"loss": 3.2527,
"step": 2390
},
{
"epoch": 0.874476225177628,
"grad_norm": 0.6640625,
"learning_rate": 7.08454810495627e-05,
"loss": 3.1984,
"step": 2400
},
{
"epoch": 0.8781198761158681,
"grad_norm": 0.58984375,
"learning_rate": 7.072400388726919e-05,
"loss": 3.2517,
"step": 2410
},
{
"epoch": 0.8817635270541082,
"grad_norm": 0.6171875,
"learning_rate": 7.06025267249757e-05,
"loss": 3.2105,
"step": 2420
},
{
"epoch": 0.8854071779923484,
"grad_norm": 0.62890625,
"learning_rate": 7.048104956268222e-05,
"loss": 3.2125,
"step": 2430
},
{
"epoch": 0.8890508289305884,
"grad_norm": 0.72265625,
"learning_rate": 7.035957240038873e-05,
"loss": 3.255,
"step": 2440
},
{
"epoch": 0.8926944798688285,
"grad_norm": 0.671875,
"learning_rate": 7.023809523809524e-05,
"loss": 3.3331,
"step": 2450
},
{
"epoch": 0.8963381308070687,
"grad_norm": 0.65234375,
"learning_rate": 7.011661807580175e-05,
"loss": 3.3545,
"step": 2460
},
{
"epoch": 0.8999817817453089,
"grad_norm": 0.62890625,
"learning_rate": 6.999514091350826e-05,
"loss": 3.2776,
"step": 2470
},
{
"epoch": 0.9036254326835489,
"grad_norm": 0.76953125,
"learning_rate": 6.987366375121478e-05,
"loss": 3.2331,
"step": 2480
},
{
"epoch": 0.907269083621789,
"grad_norm": 0.78515625,
"learning_rate": 6.975218658892128e-05,
"loss": 3.2803,
"step": 2490
},
{
"epoch": 0.9109127345600292,
"grad_norm": 0.671875,
"learning_rate": 6.963070942662779e-05,
"loss": 3.256,
"step": 2500
},
{
"epoch": 0.9145563854982692,
"grad_norm": 0.59765625,
"learning_rate": 6.950923226433431e-05,
"loss": 3.2896,
"step": 2510
},
{
"epoch": 0.9182000364365094,
"grad_norm": 0.62890625,
"learning_rate": 6.938775510204082e-05,
"loss": 3.2555,
"step": 2520
},
{
"epoch": 0.9218436873747495,
"grad_norm": 0.7421875,
"learning_rate": 6.926627793974733e-05,
"loss": 3.2682,
"step": 2530
},
{
"epoch": 0.9254873383129896,
"grad_norm": 0.671875,
"learning_rate": 6.914480077745384e-05,
"loss": 3.1564,
"step": 2540
},
{
"epoch": 0.9291309892512297,
"grad_norm": 0.6484375,
"learning_rate": 6.902332361516035e-05,
"loss": 3.1445,
"step": 2550
},
{
"epoch": 0.9327746401894699,
"grad_norm": 0.51953125,
"learning_rate": 6.890184645286687e-05,
"loss": 3.2515,
"step": 2560
},
{
"epoch": 0.9364182911277099,
"grad_norm": 0.65625,
"learning_rate": 6.878036929057337e-05,
"loss": 3.1962,
"step": 2570
},
{
"epoch": 0.9400619420659501,
"grad_norm": 0.59375,
"learning_rate": 6.865889212827988e-05,
"loss": 3.3199,
"step": 2580
},
{
"epoch": 0.9437055930041902,
"grad_norm": 0.65234375,
"learning_rate": 6.85374149659864e-05,
"loss": 3.264,
"step": 2590
},
{
"epoch": 0.9473492439424304,
"grad_norm": 0.63671875,
"learning_rate": 6.841593780369291e-05,
"loss": 3.1853,
"step": 2600
},
{
"epoch": 0.9509928948806704,
"grad_norm": 0.72265625,
"learning_rate": 6.829446064139942e-05,
"loss": 3.3017,
"step": 2610
},
{
"epoch": 0.9546365458189106,
"grad_norm": 0.6953125,
"learning_rate": 6.817298347910593e-05,
"loss": 3.2358,
"step": 2620
},
{
"epoch": 0.9582801967571507,
"grad_norm": 0.6328125,
"learning_rate": 6.805150631681244e-05,
"loss": 3.2854,
"step": 2630
},
{
"epoch": 0.9619238476953907,
"grad_norm": 0.5859375,
"learning_rate": 6.793002915451895e-05,
"loss": 3.1873,
"step": 2640
},
{
"epoch": 0.9655674986336309,
"grad_norm": 0.59375,
"learning_rate": 6.780855199222547e-05,
"loss": 3.2274,
"step": 2650
},
{
"epoch": 0.969211149571871,
"grad_norm": 0.63671875,
"learning_rate": 6.768707482993197e-05,
"loss": 3.2037,
"step": 2660
},
{
"epoch": 0.9728548005101111,
"grad_norm": 0.5703125,
"learning_rate": 6.756559766763849e-05,
"loss": 3.3132,
"step": 2670
},
{
"epoch": 0.9764984514483512,
"grad_norm": 0.72265625,
"learning_rate": 6.7444120505345e-05,
"loss": 3.2734,
"step": 2680
},
{
"epoch": 0.9801421023865914,
"grad_norm": 0.70703125,
"learning_rate": 6.732264334305151e-05,
"loss": 3.1784,
"step": 2690
},
{
"epoch": 0.9837857533248315,
"grad_norm": 0.57421875,
"learning_rate": 6.720116618075802e-05,
"loss": 3.2181,
"step": 2700
},
{
"epoch": 0.9874294042630716,
"grad_norm": 0.6953125,
"learning_rate": 6.707968901846453e-05,
"loss": 3.2676,
"step": 2710
},
{
"epoch": 0.9910730552013117,
"grad_norm": 0.6875,
"learning_rate": 6.695821185617104e-05,
"loss": 3.1952,
"step": 2720
},
{
"epoch": 0.9947167061395519,
"grad_norm": 0.609375,
"learning_rate": 6.683673469387756e-05,
"loss": 3.3135,
"step": 2730
},
{
"epoch": 0.9983603570777919,
"grad_norm": 0.6484375,
"learning_rate": 6.671525753158406e-05,
"loss": 3.2643,
"step": 2740
},
{
"epoch": 1.002004008016032,
"grad_norm": 0.6015625,
"learning_rate": 6.659378036929058e-05,
"loss": 3.1996,
"step": 2750
},
{
"epoch": 1.0056476589542722,
"grad_norm": 0.75,
"learning_rate": 6.647230320699709e-05,
"loss": 3.0862,
"step": 2760
},
{
"epoch": 1.0092913098925123,
"grad_norm": 0.671875,
"learning_rate": 6.63508260447036e-05,
"loss": 3.1886,
"step": 2770
},
{
"epoch": 1.0129349608307525,
"grad_norm": 0.65625,
"learning_rate": 6.622934888241011e-05,
"loss": 3.1478,
"step": 2780
},
{
"epoch": 1.0165786117689926,
"grad_norm": 0.69921875,
"learning_rate": 6.610787172011662e-05,
"loss": 3.1577,
"step": 2790
},
{
"epoch": 1.0202222627072326,
"grad_norm": 0.77734375,
"learning_rate": 6.598639455782313e-05,
"loss": 3.148,
"step": 2800
},
{
"epoch": 1.0238659136454729,
"grad_norm": 0.640625,
"learning_rate": 6.586491739552965e-05,
"loss": 3.1971,
"step": 2810
},
{
"epoch": 1.027509564583713,
"grad_norm": 0.58984375,
"learning_rate": 6.574344023323615e-05,
"loss": 3.1351,
"step": 2820
},
{
"epoch": 1.031153215521953,
"grad_norm": 0.734375,
"learning_rate": 6.562196307094267e-05,
"loss": 3.2304,
"step": 2830
},
{
"epoch": 1.0347968664601932,
"grad_norm": 0.71484375,
"learning_rate": 6.550048590864918e-05,
"loss": 3.1582,
"step": 2840
},
{
"epoch": 1.0384405173984332,
"grad_norm": 0.71875,
"learning_rate": 6.537900874635569e-05,
"loss": 3.1183,
"step": 2850
},
{
"epoch": 1.0420841683366733,
"grad_norm": 0.8046875,
"learning_rate": 6.52575315840622e-05,
"loss": 3.2056,
"step": 2860
},
{
"epoch": 1.0457278192749135,
"grad_norm": 0.765625,
"learning_rate": 6.513605442176871e-05,
"loss": 3.1694,
"step": 2870
},
{
"epoch": 1.0493714702131536,
"grad_norm": 0.890625,
"learning_rate": 6.501457725947522e-05,
"loss": 3.1428,
"step": 2880
},
{
"epoch": 1.0530151211513936,
"grad_norm": 0.65625,
"learning_rate": 6.489310009718174e-05,
"loss": 3.1052,
"step": 2890
},
{
"epoch": 1.0566587720896339,
"grad_norm": 0.83203125,
"learning_rate": 6.477162293488824e-05,
"loss": 3.1195,
"step": 2900
},
{
"epoch": 1.060302423027874,
"grad_norm": 0.7421875,
"learning_rate": 6.465014577259475e-05,
"loss": 3.2278,
"step": 2910
},
{
"epoch": 1.063946073966114,
"grad_norm": 0.71875,
"learning_rate": 6.452866861030127e-05,
"loss": 3.1563,
"step": 2920
},
{
"epoch": 1.0675897249043542,
"grad_norm": 0.69140625,
"learning_rate": 6.440719144800778e-05,
"loss": 3.1505,
"step": 2930
},
{
"epoch": 1.0712333758425943,
"grad_norm": 0.8515625,
"learning_rate": 6.428571428571429e-05,
"loss": 3.1681,
"step": 2940
},
{
"epoch": 1.0748770267808343,
"grad_norm": 0.71484375,
"learning_rate": 6.41642371234208e-05,
"loss": 3.17,
"step": 2950
},
{
"epoch": 1.0785206777190746,
"grad_norm": 0.90625,
"learning_rate": 6.40427599611273e-05,
"loss": 3.1775,
"step": 2960
},
{
"epoch": 1.0821643286573146,
"grad_norm": 0.73828125,
"learning_rate": 6.392128279883383e-05,
"loss": 3.0921,
"step": 2970
},
{
"epoch": 1.0858079795955549,
"grad_norm": 0.75390625,
"learning_rate": 6.379980563654034e-05,
"loss": 3.1666,
"step": 2980
},
{
"epoch": 1.089451630533795,
"grad_norm": 0.80859375,
"learning_rate": 6.367832847424684e-05,
"loss": 3.1935,
"step": 2990
},
{
"epoch": 1.093095281472035,
"grad_norm": 0.67578125,
"learning_rate": 6.355685131195336e-05,
"loss": 3.0588,
"step": 3000
},
{
"epoch": 1.096738932410275,
"grad_norm": 0.74609375,
"learning_rate": 6.343537414965987e-05,
"loss": 3.1867,
"step": 3010
},
{
"epoch": 1.1003825833485152,
"grad_norm": 0.8828125,
"learning_rate": 6.331389698736638e-05,
"loss": 3.162,
"step": 3020
},
{
"epoch": 1.1040262342867553,
"grad_norm": 0.78515625,
"learning_rate": 6.319241982507289e-05,
"loss": 3.1737,
"step": 3030
},
{
"epoch": 1.1076698852249955,
"grad_norm": 0.76171875,
"learning_rate": 6.30709426627794e-05,
"loss": 3.1974,
"step": 3040
},
{
"epoch": 1.1113135361632356,
"grad_norm": 0.7734375,
"learning_rate": 6.294946550048592e-05,
"loss": 3.1584,
"step": 3050
},
{
"epoch": 1.1149571871014756,
"grad_norm": 0.74609375,
"learning_rate": 6.282798833819243e-05,
"loss": 3.1856,
"step": 3060
},
{
"epoch": 1.1186008380397159,
"grad_norm": 0.7109375,
"learning_rate": 6.270651117589892e-05,
"loss": 3.177,
"step": 3070
},
{
"epoch": 1.122244488977956,
"grad_norm": 0.85546875,
"learning_rate": 6.258503401360545e-05,
"loss": 3.2028,
"step": 3080
},
{
"epoch": 1.125888139916196,
"grad_norm": 0.93359375,
"learning_rate": 6.246355685131196e-05,
"loss": 3.2031,
"step": 3090
},
{
"epoch": 1.1295317908544362,
"grad_norm": 0.82421875,
"learning_rate": 6.234207968901847e-05,
"loss": 3.0629,
"step": 3100
},
{
"epoch": 1.1331754417926763,
"grad_norm": 0.6875,
"learning_rate": 6.222060252672498e-05,
"loss": 3.0927,
"step": 3110
},
{
"epoch": 1.1368190927309163,
"grad_norm": 0.765625,
"learning_rate": 6.209912536443149e-05,
"loss": 3.2134,
"step": 3120
},
{
"epoch": 1.1404627436691566,
"grad_norm": 0.84765625,
"learning_rate": 6.197764820213801e-05,
"loss": 3.2027,
"step": 3130
},
{
"epoch": 1.1441063946073966,
"grad_norm": 0.70703125,
"learning_rate": 6.185617103984452e-05,
"loss": 3.1448,
"step": 3140
},
{
"epoch": 1.1477500455456366,
"grad_norm": 0.70703125,
"learning_rate": 6.173469387755101e-05,
"loss": 3.1713,
"step": 3150
},
{
"epoch": 1.151393696483877,
"grad_norm": 0.77734375,
"learning_rate": 6.161321671525754e-05,
"loss": 3.1612,
"step": 3160
},
{
"epoch": 1.155037347422117,
"grad_norm": 0.79296875,
"learning_rate": 6.149173955296405e-05,
"loss": 3.1934,
"step": 3170
},
{
"epoch": 1.158680998360357,
"grad_norm": 0.89453125,
"learning_rate": 6.137026239067056e-05,
"loss": 3.1231,
"step": 3180
},
{
"epoch": 1.1623246492985972,
"grad_norm": 0.75390625,
"learning_rate": 6.124878522837707e-05,
"loss": 3.1606,
"step": 3190
},
{
"epoch": 1.1659683002368373,
"grad_norm": 0.75,
"learning_rate": 6.112730806608357e-05,
"loss": 3.135,
"step": 3200
},
{
"epoch": 1.1696119511750775,
"grad_norm": 0.78125,
"learning_rate": 6.10058309037901e-05,
"loss": 3.1592,
"step": 3210
},
{
"epoch": 1.1732556021133176,
"grad_norm": 0.84375,
"learning_rate": 6.08843537414966e-05,
"loss": 3.2429,
"step": 3220
},
{
"epoch": 1.1768992530515576,
"grad_norm": 0.921875,
"learning_rate": 6.076287657920311e-05,
"loss": 3.1182,
"step": 3230
},
{
"epoch": 1.1805429039897977,
"grad_norm": 0.83203125,
"learning_rate": 6.0641399416909626e-05,
"loss": 3.2273,
"step": 3240
},
{
"epoch": 1.184186554928038,
"grad_norm": 0.734375,
"learning_rate": 6.0519922254616135e-05,
"loss": 3.2101,
"step": 3250
},
{
"epoch": 1.187830205866278,
"grad_norm": 0.76953125,
"learning_rate": 6.0398445092322645e-05,
"loss": 3.1181,
"step": 3260
},
{
"epoch": 1.1914738568045182,
"grad_norm": 0.7265625,
"learning_rate": 6.027696793002916e-05,
"loss": 3.1349,
"step": 3270
},
{
"epoch": 1.1951175077427583,
"grad_norm": 0.90234375,
"learning_rate": 6.015549076773567e-05,
"loss": 3.152,
"step": 3280
},
{
"epoch": 1.1987611586809983,
"grad_norm": 0.75390625,
"learning_rate": 6.003401360544217e-05,
"loss": 3.1806,
"step": 3290
},
{
"epoch": 1.2024048096192386,
"grad_norm": 0.85546875,
"learning_rate": 5.991253644314869e-05,
"loss": 3.1708,
"step": 3300
},
{
"epoch": 1.2060484605574786,
"grad_norm": 0.78125,
"learning_rate": 5.97910592808552e-05,
"loss": 3.114,
"step": 3310
},
{
"epoch": 1.2096921114957186,
"grad_norm": 0.90625,
"learning_rate": 5.9669582118561715e-05,
"loss": 3.1852,
"step": 3320
},
{
"epoch": 1.213335762433959,
"grad_norm": 0.7578125,
"learning_rate": 5.9548104956268225e-05,
"loss": 3.2373,
"step": 3330
},
{
"epoch": 1.216979413372199,
"grad_norm": 0.8046875,
"learning_rate": 5.9426627793974734e-05,
"loss": 3.2133,
"step": 3340
},
{
"epoch": 1.220623064310439,
"grad_norm": 0.7890625,
"learning_rate": 5.930515063168125e-05,
"loss": 3.2556,
"step": 3350
},
{
"epoch": 1.2242667152486792,
"grad_norm": 0.71875,
"learning_rate": 5.918367346938776e-05,
"loss": 3.193,
"step": 3360
},
{
"epoch": 1.2279103661869193,
"grad_norm": 0.71484375,
"learning_rate": 5.906219630709426e-05,
"loss": 3.1619,
"step": 3370
},
{
"epoch": 1.2315540171251593,
"grad_norm": 0.94140625,
"learning_rate": 5.8940719144800785e-05,
"loss": 3.1265,
"step": 3380
},
{
"epoch": 1.2351976680633996,
"grad_norm": 0.80859375,
"learning_rate": 5.881924198250729e-05,
"loss": 3.2705,
"step": 3390
},
{
"epoch": 1.2388413190016396,
"grad_norm": 0.77734375,
"learning_rate": 5.8697764820213804e-05,
"loss": 3.1545,
"step": 3400
},
{
"epoch": 1.2424849699398797,
"grad_norm": 1.015625,
"learning_rate": 5.8576287657920314e-05,
"loss": 3.1632,
"step": 3410
},
{
"epoch": 1.24612862087812,
"grad_norm": 0.75390625,
"learning_rate": 5.845481049562682e-05,
"loss": 3.1776,
"step": 3420
},
{
"epoch": 1.24977227181636,
"grad_norm": 0.90625,
"learning_rate": 5.833333333333334e-05,
"loss": 3.1733,
"step": 3430
},
{
"epoch": 1.2534159227546002,
"grad_norm": 0.890625,
"learning_rate": 5.821185617103985e-05,
"loss": 3.0226,
"step": 3440
},
{
"epoch": 1.2570595736928403,
"grad_norm": 0.8046875,
"learning_rate": 5.809037900874635e-05,
"loss": 3.156,
"step": 3450
},
{
"epoch": 1.2607032246310803,
"grad_norm": 0.85546875,
"learning_rate": 5.7968901846452875e-05,
"loss": 3.0929,
"step": 3460
},
{
"epoch": 1.2643468755693203,
"grad_norm": 0.70703125,
"learning_rate": 5.784742468415938e-05,
"loss": 3.1027,
"step": 3470
},
{
"epoch": 1.2679905265075606,
"grad_norm": 0.76171875,
"learning_rate": 5.77259475218659e-05,
"loss": 3.2188,
"step": 3480
},
{
"epoch": 1.2716341774458007,
"grad_norm": 0.8671875,
"learning_rate": 5.76044703595724e-05,
"loss": 3.0835,
"step": 3490
},
{
"epoch": 1.275277828384041,
"grad_norm": 0.82421875,
"learning_rate": 5.748299319727891e-05,
"loss": 3.0709,
"step": 3500
},
{
"epoch": 1.278921479322281,
"grad_norm": 0.79296875,
"learning_rate": 5.736151603498543e-05,
"loss": 3.1397,
"step": 3510
},
{
"epoch": 1.282565130260521,
"grad_norm": 0.83203125,
"learning_rate": 5.724003887269194e-05,
"loss": 3.1717,
"step": 3520
},
{
"epoch": 1.286208781198761,
"grad_norm": 0.875,
"learning_rate": 5.711856171039844e-05,
"loss": 3.1881,
"step": 3530
},
{
"epoch": 1.2898524321370013,
"grad_norm": 0.859375,
"learning_rate": 5.6997084548104964e-05,
"loss": 3.1279,
"step": 3540
},
{
"epoch": 1.2934960830752413,
"grad_norm": 0.82421875,
"learning_rate": 5.6875607385811467e-05,
"loss": 3.1212,
"step": 3550
},
{
"epoch": 1.2971397340134816,
"grad_norm": 0.95703125,
"learning_rate": 5.6754130223517976e-05,
"loss": 3.1591,
"step": 3560
},
{
"epoch": 1.3007833849517216,
"grad_norm": 0.8125,
"learning_rate": 5.663265306122449e-05,
"loss": 3.1113,
"step": 3570
},
{
"epoch": 1.3044270358899617,
"grad_norm": 0.90234375,
"learning_rate": 5.6511175898931e-05,
"loss": 3.2222,
"step": 3580
},
{
"epoch": 1.308070686828202,
"grad_norm": 0.734375,
"learning_rate": 5.638969873663752e-05,
"loss": 3.1596,
"step": 3590
},
{
"epoch": 1.311714337766442,
"grad_norm": 0.76171875,
"learning_rate": 5.626822157434403e-05,
"loss": 3.1784,
"step": 3600
},
{
"epoch": 1.315357988704682,
"grad_norm": 0.7734375,
"learning_rate": 5.614674441205054e-05,
"loss": 3.1464,
"step": 3610
},
{
"epoch": 1.3190016396429223,
"grad_norm": 0.7265625,
"learning_rate": 5.602526724975705e-05,
"loss": 3.1616,
"step": 3620
},
{
"epoch": 1.3226452905811623,
"grad_norm": 0.81640625,
"learning_rate": 5.5903790087463556e-05,
"loss": 3.1747,
"step": 3630
},
{
"epoch": 1.3262889415194024,
"grad_norm": 0.88671875,
"learning_rate": 5.5782312925170065e-05,
"loss": 3.137,
"step": 3640
},
{
"epoch": 1.3299325924576426,
"grad_norm": 0.75390625,
"learning_rate": 5.566083576287658e-05,
"loss": 3.1302,
"step": 3650
},
{
"epoch": 1.3335762433958827,
"grad_norm": 0.79296875,
"learning_rate": 5.553935860058309e-05,
"loss": 3.2009,
"step": 3660
},
{
"epoch": 1.337219894334123,
"grad_norm": 0.8203125,
"learning_rate": 5.541788143828961e-05,
"loss": 3.1738,
"step": 3670
},
{
"epoch": 1.340863545272363,
"grad_norm": 0.83203125,
"learning_rate": 5.529640427599612e-05,
"loss": 3.0996,
"step": 3680
},
{
"epoch": 1.344507196210603,
"grad_norm": 1.1796875,
"learning_rate": 5.5174927113702626e-05,
"loss": 3.2209,
"step": 3690
},
{
"epoch": 1.348150847148843,
"grad_norm": 0.84765625,
"learning_rate": 5.505344995140914e-05,
"loss": 3.1315,
"step": 3700
},
{
"epoch": 1.3517944980870833,
"grad_norm": 0.78515625,
"learning_rate": 5.493197278911565e-05,
"loss": 3.1241,
"step": 3710
},
{
"epoch": 1.3554381490253233,
"grad_norm": 0.73046875,
"learning_rate": 5.4810495626822155e-05,
"loss": 3.2209,
"step": 3720
},
{
"epoch": 1.3590817999635636,
"grad_norm": 0.796875,
"learning_rate": 5.468901846452867e-05,
"loss": 3.1234,
"step": 3730
},
{
"epoch": 1.3627254509018036,
"grad_norm": 0.78515625,
"learning_rate": 5.456754130223518e-05,
"loss": 3.0762,
"step": 3740
},
{
"epoch": 1.3663691018400437,
"grad_norm": 0.8828125,
"learning_rate": 5.444606413994169e-05,
"loss": 3.1506,
"step": 3750
},
{
"epoch": 1.3700127527782837,
"grad_norm": 0.796875,
"learning_rate": 5.4324586977648206e-05,
"loss": 3.1047,
"step": 3760
},
{
"epoch": 1.373656403716524,
"grad_norm": 0.90234375,
"learning_rate": 5.4203109815354715e-05,
"loss": 3.1776,
"step": 3770
},
{
"epoch": 1.377300054654764,
"grad_norm": 0.859375,
"learning_rate": 5.408163265306123e-05,
"loss": 3.1998,
"step": 3780
},
{
"epoch": 1.3809437055930043,
"grad_norm": 0.87890625,
"learning_rate": 5.396015549076774e-05,
"loss": 3.3064,
"step": 3790
},
{
"epoch": 1.3845873565312443,
"grad_norm": 0.8671875,
"learning_rate": 5.3838678328474244e-05,
"loss": 3.1491,
"step": 3800
},
{
"epoch": 1.3882310074694844,
"grad_norm": 0.8828125,
"learning_rate": 5.371720116618077e-05,
"loss": 3.2158,
"step": 3810
},
{
"epoch": 1.3918746584077246,
"grad_norm": 0.84765625,
"learning_rate": 5.359572400388727e-05,
"loss": 3.0956,
"step": 3820
},
{
"epoch": 1.3955183093459647,
"grad_norm": 0.72265625,
"learning_rate": 5.347424684159378e-05,
"loss": 3.159,
"step": 3830
},
{
"epoch": 1.3991619602842047,
"grad_norm": 0.9296875,
"learning_rate": 5.3352769679300295e-05,
"loss": 3.2044,
"step": 3840
},
{
"epoch": 1.402805611222445,
"grad_norm": 0.76953125,
"learning_rate": 5.3231292517006805e-05,
"loss": 3.1354,
"step": 3850
},
{
"epoch": 1.406449262160685,
"grad_norm": 0.79296875,
"learning_rate": 5.310981535471332e-05,
"loss": 3.2342,
"step": 3860
},
{
"epoch": 1.410092913098925,
"grad_norm": 0.81640625,
"learning_rate": 5.298833819241983e-05,
"loss": 3.1566,
"step": 3870
},
{
"epoch": 1.4137365640371653,
"grad_norm": 0.8671875,
"learning_rate": 5.286686103012633e-05,
"loss": 3.1535,
"step": 3880
},
{
"epoch": 1.4173802149754053,
"grad_norm": 0.8046875,
"learning_rate": 5.2745383867832856e-05,
"loss": 3.1968,
"step": 3890
},
{
"epoch": 1.4210238659136456,
"grad_norm": 1.0390625,
"learning_rate": 5.262390670553936e-05,
"loss": 3.2237,
"step": 3900
},
{
"epoch": 1.4246675168518856,
"grad_norm": 0.8203125,
"learning_rate": 5.250242954324587e-05,
"loss": 3.154,
"step": 3910
},
{
"epoch": 1.4283111677901257,
"grad_norm": 0.921875,
"learning_rate": 5.2380952380952384e-05,
"loss": 3.2096,
"step": 3920
},
{
"epoch": 1.4319548187283657,
"grad_norm": 0.84765625,
"learning_rate": 5.2259475218658894e-05,
"loss": 3.1827,
"step": 3930
},
{
"epoch": 1.435598469666606,
"grad_norm": 1.0234375,
"learning_rate": 5.213799805636541e-05,
"loss": 3.1439,
"step": 3940
},
{
"epoch": 1.439242120604846,
"grad_norm": 0.78515625,
"learning_rate": 5.201652089407192e-05,
"loss": 3.1562,
"step": 3950
},
{
"epoch": 1.4428857715430863,
"grad_norm": 0.8828125,
"learning_rate": 5.189504373177842e-05,
"loss": 3.1539,
"step": 3960
},
{
"epoch": 1.4465294224813263,
"grad_norm": 0.75390625,
"learning_rate": 5.1773566569484945e-05,
"loss": 3.1449,
"step": 3970
},
{
"epoch": 1.4501730734195664,
"grad_norm": 0.94140625,
"learning_rate": 5.165208940719145e-05,
"loss": 3.1867,
"step": 3980
},
{
"epoch": 1.4538167243578064,
"grad_norm": 0.7578125,
"learning_rate": 5.153061224489796e-05,
"loss": 3.1182,
"step": 3990
},
{
"epoch": 1.4574603752960467,
"grad_norm": 0.83203125,
"learning_rate": 5.1409135082604474e-05,
"loss": 3.1719,
"step": 4000
},
{
"epoch": 1.4611040262342867,
"grad_norm": 0.87890625,
"learning_rate": 5.128765792031098e-05,
"loss": 3.2043,
"step": 4010
},
{
"epoch": 1.464747677172527,
"grad_norm": 0.875,
"learning_rate": 5.116618075801749e-05,
"loss": 3.1937,
"step": 4020
},
{
"epoch": 1.468391328110767,
"grad_norm": 0.72265625,
"learning_rate": 5.104470359572401e-05,
"loss": 3.1864,
"step": 4030
},
{
"epoch": 1.472034979049007,
"grad_norm": 0.8828125,
"learning_rate": 5.092322643343052e-05,
"loss": 3.1312,
"step": 4040
},
{
"epoch": 1.4756786299872473,
"grad_norm": 0.8828125,
"learning_rate": 5.0801749271137035e-05,
"loss": 3.0836,
"step": 4050
},
{
"epoch": 1.4793222809254873,
"grad_norm": 0.9296875,
"learning_rate": 5.068027210884354e-05,
"loss": 3.1631,
"step": 4060
},
{
"epoch": 1.4829659318637274,
"grad_norm": 0.77734375,
"learning_rate": 5.055879494655005e-05,
"loss": 3.1043,
"step": 4070
},
{
"epoch": 1.4866095828019676,
"grad_norm": 0.71875,
"learning_rate": 5.043731778425656e-05,
"loss": 3.281,
"step": 4080
},
{
"epoch": 1.4902532337402077,
"grad_norm": 0.6640625,
"learning_rate": 5.031584062196307e-05,
"loss": 3.1505,
"step": 4090
},
{
"epoch": 1.4938968846784477,
"grad_norm": 0.8359375,
"learning_rate": 5.019436345966958e-05,
"loss": 3.1435,
"step": 4100
},
{
"epoch": 1.497540535616688,
"grad_norm": 0.828125,
"learning_rate": 5.00728862973761e-05,
"loss": 3.1984,
"step": 4110
},
{
"epoch": 1.501184186554928,
"grad_norm": 0.984375,
"learning_rate": 4.995140913508261e-05,
"loss": 3.1526,
"step": 4120
},
{
"epoch": 1.5048278374931683,
"grad_norm": 0.84765625,
"learning_rate": 4.982993197278912e-05,
"loss": 3.1732,
"step": 4130
},
{
"epoch": 1.5084714884314083,
"grad_norm": 0.75,
"learning_rate": 4.970845481049563e-05,
"loss": 3.2319,
"step": 4140
},
{
"epoch": 1.5121151393696484,
"grad_norm": 0.76953125,
"learning_rate": 4.958697764820214e-05,
"loss": 3.1467,
"step": 4150
},
{
"epoch": 1.5157587903078884,
"grad_norm": 0.828125,
"learning_rate": 4.946550048590865e-05,
"loss": 3.0744,
"step": 4160
},
{
"epoch": 1.5194024412461287,
"grad_norm": 0.76171875,
"learning_rate": 4.934402332361516e-05,
"loss": 3.1253,
"step": 4170
},
{
"epoch": 1.5230460921843687,
"grad_norm": 0.83203125,
"learning_rate": 4.922254616132168e-05,
"loss": 3.1691,
"step": 4180
},
{
"epoch": 1.526689743122609,
"grad_norm": 0.6796875,
"learning_rate": 4.910106899902818e-05,
"loss": 3.1127,
"step": 4190
},
{
"epoch": 1.530333394060849,
"grad_norm": 0.8046875,
"learning_rate": 4.89795918367347e-05,
"loss": 3.199,
"step": 4200
},
{
"epoch": 1.533977044999089,
"grad_norm": 0.76171875,
"learning_rate": 4.8858114674441206e-05,
"loss": 3.1458,
"step": 4210
},
{
"epoch": 1.537620695937329,
"grad_norm": 0.90234375,
"learning_rate": 4.873663751214772e-05,
"loss": 3.144,
"step": 4220
},
{
"epoch": 1.5412643468755693,
"grad_norm": 0.86328125,
"learning_rate": 4.8615160349854225e-05,
"loss": 3.1611,
"step": 4230
},
{
"epoch": 1.5449079978138094,
"grad_norm": 0.9921875,
"learning_rate": 4.849368318756074e-05,
"loss": 3.2377,
"step": 4240
},
{
"epoch": 1.5485516487520496,
"grad_norm": 0.8046875,
"learning_rate": 4.837220602526725e-05,
"loss": 3.0911,
"step": 4250
},
{
"epoch": 1.5521952996902897,
"grad_norm": 0.859375,
"learning_rate": 4.825072886297377e-05,
"loss": 3.1286,
"step": 4260
},
{
"epoch": 1.5558389506285297,
"grad_norm": 0.875,
"learning_rate": 4.812925170068027e-05,
"loss": 3.1614,
"step": 4270
},
{
"epoch": 1.5594826015667698,
"grad_norm": 0.75390625,
"learning_rate": 4.8007774538386786e-05,
"loss": 3.1076,
"step": 4280
},
{
"epoch": 1.56312625250501,
"grad_norm": 0.83984375,
"learning_rate": 4.7886297376093295e-05,
"loss": 3.1806,
"step": 4290
},
{
"epoch": 1.5667699034432503,
"grad_norm": 0.80859375,
"learning_rate": 4.776482021379981e-05,
"loss": 3.1957,
"step": 4300
},
{
"epoch": 1.5704135543814903,
"grad_norm": 1.0546875,
"learning_rate": 4.7643343051506314e-05,
"loss": 3.1933,
"step": 4310
},
{
"epoch": 1.5740572053197304,
"grad_norm": 0.9375,
"learning_rate": 4.752186588921283e-05,
"loss": 3.212,
"step": 4320
},
{
"epoch": 1.5777008562579704,
"grad_norm": 0.8671875,
"learning_rate": 4.740038872691934e-05,
"loss": 3.1293,
"step": 4330
},
{
"epoch": 1.5813445071962104,
"grad_norm": 0.8359375,
"learning_rate": 4.7278911564625856e-05,
"loss": 3.2165,
"step": 4340
},
{
"epoch": 1.5849881581344507,
"grad_norm": 0.84765625,
"learning_rate": 4.715743440233236e-05,
"loss": 3.1911,
"step": 4350
},
{
"epoch": 1.588631809072691,
"grad_norm": 1.03125,
"learning_rate": 4.7035957240038875e-05,
"loss": 3.1359,
"step": 4360
},
{
"epoch": 1.592275460010931,
"grad_norm": 0.79296875,
"learning_rate": 4.6914480077745385e-05,
"loss": 3.2345,
"step": 4370
},
{
"epoch": 1.595919110949171,
"grad_norm": 0.80859375,
"learning_rate": 4.6793002915451894e-05,
"loss": 3.1874,
"step": 4380
},
{
"epoch": 1.599562761887411,
"grad_norm": 0.85546875,
"learning_rate": 4.667152575315841e-05,
"loss": 3.2192,
"step": 4390
},
{
"epoch": 1.6032064128256514,
"grad_norm": 0.7734375,
"learning_rate": 4.655004859086492e-05,
"loss": 3.1632,
"step": 4400
},
{
"epoch": 1.6068500637638914,
"grad_norm": 0.734375,
"learning_rate": 4.642857142857143e-05,
"loss": 3.1723,
"step": 4410
},
{
"epoch": 1.6104937147021317,
"grad_norm": 0.91015625,
"learning_rate": 4.630709426627794e-05,
"loss": 3.1858,
"step": 4420
},
{
"epoch": 1.6141373656403717,
"grad_norm": 0.84765625,
"learning_rate": 4.6185617103984455e-05,
"loss": 3.1226,
"step": 4430
},
{
"epoch": 1.6177810165786117,
"grad_norm": 0.87109375,
"learning_rate": 4.6064139941690965e-05,
"loss": 3.2065,
"step": 4440
},
{
"epoch": 1.6214246675168518,
"grad_norm": 0.87890625,
"learning_rate": 4.5942662779397474e-05,
"loss": 3.105,
"step": 4450
},
{
"epoch": 1.625068318455092,
"grad_norm": 0.9609375,
"learning_rate": 4.5821185617103983e-05,
"loss": 3.1379,
"step": 4460
},
{
"epoch": 1.628711969393332,
"grad_norm": 0.75,
"learning_rate": 4.56997084548105e-05,
"loss": 3.1684,
"step": 4470
},
{
"epoch": 1.6323556203315723,
"grad_norm": 0.74609375,
"learning_rate": 4.557823129251701e-05,
"loss": 3.1278,
"step": 4480
},
{
"epoch": 1.6359992712698124,
"grad_norm": 0.84765625,
"learning_rate": 4.5456754130223525e-05,
"loss": 3.1971,
"step": 4490
},
{
"epoch": 1.6396429222080524,
"grad_norm": 0.9296875,
"learning_rate": 4.533527696793003e-05,
"loss": 3.1004,
"step": 4500
},
{
"epoch": 1.6432865731462925,
"grad_norm": 0.81640625,
"learning_rate": 4.5213799805636544e-05,
"loss": 3.1026,
"step": 4510
},
{
"epoch": 1.6469302240845327,
"grad_norm": 0.80859375,
"learning_rate": 4.5092322643343054e-05,
"loss": 3.1681,
"step": 4520
},
{
"epoch": 1.650573875022773,
"grad_norm": 0.7109375,
"learning_rate": 4.497084548104957e-05,
"loss": 3.185,
"step": 4530
},
{
"epoch": 1.654217525961013,
"grad_norm": 0.859375,
"learning_rate": 4.484936831875607e-05,
"loss": 3.1992,
"step": 4540
},
{
"epoch": 1.657861176899253,
"grad_norm": 0.953125,
"learning_rate": 4.472789115646259e-05,
"loss": 3.1486,
"step": 4550
},
{
"epoch": 1.661504827837493,
"grad_norm": 0.8671875,
"learning_rate": 4.46064139941691e-05,
"loss": 3.1765,
"step": 4560
},
{
"epoch": 1.6651484787757331,
"grad_norm": 0.77734375,
"learning_rate": 4.4484936831875615e-05,
"loss": 3.1672,
"step": 4570
},
{
"epoch": 1.6687921297139734,
"grad_norm": 0.734375,
"learning_rate": 4.436345966958212e-05,
"loss": 3.1509,
"step": 4580
},
{
"epoch": 1.6724357806522137,
"grad_norm": 0.84765625,
"learning_rate": 4.4241982507288634e-05,
"loss": 3.1479,
"step": 4590
},
{
"epoch": 1.6760794315904537,
"grad_norm": 0.84765625,
"learning_rate": 4.412050534499514e-05,
"loss": 3.1274,
"step": 4600
},
{
"epoch": 1.6797230825286937,
"grad_norm": 0.859375,
"learning_rate": 4.399902818270165e-05,
"loss": 3.1988,
"step": 4610
},
{
"epoch": 1.6833667334669338,
"grad_norm": 0.765625,
"learning_rate": 4.387755102040816e-05,
"loss": 3.1433,
"step": 4620
},
{
"epoch": 1.687010384405174,
"grad_norm": 0.76171875,
"learning_rate": 4.375607385811468e-05,
"loss": 3.1616,
"step": 4630
},
{
"epoch": 1.690654035343414,
"grad_norm": 0.8515625,
"learning_rate": 4.363459669582119e-05,
"loss": 3.2244,
"step": 4640
},
{
"epoch": 1.6942976862816543,
"grad_norm": 0.9921875,
"learning_rate": 4.35131195335277e-05,
"loss": 3.2014,
"step": 4650
},
{
"epoch": 1.6979413372198944,
"grad_norm": 0.859375,
"learning_rate": 4.3391642371234207e-05,
"loss": 3.1558,
"step": 4660
},
{
"epoch": 1.7015849881581344,
"grad_norm": 0.93359375,
"learning_rate": 4.327016520894072e-05,
"loss": 3.1166,
"step": 4670
},
{
"epoch": 1.7052286390963745,
"grad_norm": 0.89453125,
"learning_rate": 4.314868804664723e-05,
"loss": 3.1352,
"step": 4680
},
{
"epoch": 1.7088722900346147,
"grad_norm": 0.9453125,
"learning_rate": 4.302721088435374e-05,
"loss": 3.1346,
"step": 4690
},
{
"epoch": 1.7125159409728548,
"grad_norm": 0.7890625,
"learning_rate": 4.290573372206025e-05,
"loss": 3.1268,
"step": 4700
},
{
"epoch": 1.716159591911095,
"grad_norm": 0.8828125,
"learning_rate": 4.278425655976677e-05,
"loss": 3.2035,
"step": 4710
},
{
"epoch": 1.719803242849335,
"grad_norm": 0.75390625,
"learning_rate": 4.266277939747328e-05,
"loss": 3.153,
"step": 4720
},
{
"epoch": 1.723446893787575,
"grad_norm": 0.78125,
"learning_rate": 4.2541302235179786e-05,
"loss": 3.1211,
"step": 4730
},
{
"epoch": 1.7270905447258151,
"grad_norm": 0.75,
"learning_rate": 4.2419825072886296e-05,
"loss": 3.1218,
"step": 4740
},
{
"epoch": 1.7307341956640554,
"grad_norm": 0.9140625,
"learning_rate": 4.229834791059281e-05,
"loss": 3.1847,
"step": 4750
},
{
"epoch": 1.7343778466022957,
"grad_norm": 0.99609375,
"learning_rate": 4.217687074829932e-05,
"loss": 3.1372,
"step": 4760
},
{
"epoch": 1.7380214975405357,
"grad_norm": 0.8359375,
"learning_rate": 4.205539358600583e-05,
"loss": 3.1543,
"step": 4770
},
{
"epoch": 1.7416651484787757,
"grad_norm": 0.94140625,
"learning_rate": 4.193391642371235e-05,
"loss": 3.2583,
"step": 4780
},
{
"epoch": 1.7453087994170158,
"grad_norm": 1.0703125,
"learning_rate": 4.181243926141886e-05,
"loss": 3.2017,
"step": 4790
},
{
"epoch": 1.7489524503552558,
"grad_norm": 0.84375,
"learning_rate": 4.1690962099125366e-05,
"loss": 3.1221,
"step": 4800
},
{
"epoch": 1.752596101293496,
"grad_norm": 0.9140625,
"learning_rate": 4.1569484936831876e-05,
"loss": 3.017,
"step": 4810
},
{
"epoch": 1.7562397522317363,
"grad_norm": 0.84375,
"learning_rate": 4.144800777453839e-05,
"loss": 3.0838,
"step": 4820
},
{
"epoch": 1.7598834031699764,
"grad_norm": 0.83984375,
"learning_rate": 4.13265306122449e-05,
"loss": 3.1651,
"step": 4830
},
{
"epoch": 1.7635270541082164,
"grad_norm": 0.74609375,
"learning_rate": 4.120505344995141e-05,
"loss": 3.2177,
"step": 4840
},
{
"epoch": 1.7671707050464565,
"grad_norm": 0.91796875,
"learning_rate": 4.108357628765792e-05,
"loss": 3.2003,
"step": 4850
},
{
"epoch": 1.7708143559846967,
"grad_norm": 0.8359375,
"learning_rate": 4.0962099125364436e-05,
"loss": 3.2039,
"step": 4860
},
{
"epoch": 1.7744580069229368,
"grad_norm": 0.7890625,
"learning_rate": 4.0840621963070946e-05,
"loss": 3.1705,
"step": 4870
},
{
"epoch": 1.778101657861177,
"grad_norm": 0.8515625,
"learning_rate": 4.0719144800777455e-05,
"loss": 3.1413,
"step": 4880
},
{
"epoch": 1.781745308799417,
"grad_norm": 0.83203125,
"learning_rate": 4.0597667638483965e-05,
"loss": 3.177,
"step": 4890
},
{
"epoch": 1.785388959737657,
"grad_norm": 0.79296875,
"learning_rate": 4.047619047619048e-05,
"loss": 3.19,
"step": 4900
},
{
"epoch": 1.7890326106758971,
"grad_norm": 0.76171875,
"learning_rate": 4.035471331389699e-05,
"loss": 3.1219,
"step": 4910
},
{
"epoch": 1.7926762616141374,
"grad_norm": 0.8046875,
"learning_rate": 4.02332361516035e-05,
"loss": 3.2115,
"step": 4920
},
{
"epoch": 1.7963199125523774,
"grad_norm": 0.8671875,
"learning_rate": 4.011175898931001e-05,
"loss": 3.2519,
"step": 4930
},
{
"epoch": 1.7999635634906177,
"grad_norm": 0.76953125,
"learning_rate": 3.9990281827016526e-05,
"loss": 3.1165,
"step": 4940
},
{
"epoch": 1.8036072144288577,
"grad_norm": 0.890625,
"learning_rate": 3.9868804664723035e-05,
"loss": 3.1574,
"step": 4950
},
{
"epoch": 1.8072508653670978,
"grad_norm": 0.765625,
"learning_rate": 3.9747327502429545e-05,
"loss": 3.1719,
"step": 4960
},
{
"epoch": 1.8108945163053378,
"grad_norm": 0.87109375,
"learning_rate": 3.9625850340136054e-05,
"loss": 3.204,
"step": 4970
},
{
"epoch": 1.814538167243578,
"grad_norm": 0.7421875,
"learning_rate": 3.950437317784257e-05,
"loss": 3.1539,
"step": 4980
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.875,
"learning_rate": 3.938289601554908e-05,
"loss": 3.2391,
"step": 4990
},
{
"epoch": 1.8218254691200584,
"grad_norm": 0.88671875,
"learning_rate": 3.926141885325559e-05,
"loss": 3.2341,
"step": 5000
},
{
"epoch": 1.8254691200582984,
"grad_norm": 0.9296875,
"learning_rate": 3.91399416909621e-05,
"loss": 3.1369,
"step": 5010
},
{
"epoch": 1.8291127709965385,
"grad_norm": 0.83203125,
"learning_rate": 3.9018464528668615e-05,
"loss": 3.2285,
"step": 5020
},
{
"epoch": 1.8327564219347785,
"grad_norm": 0.84375,
"learning_rate": 3.8896987366375124e-05,
"loss": 3.1143,
"step": 5030
},
{
"epoch": 1.8364000728730188,
"grad_norm": 0.83203125,
"learning_rate": 3.8775510204081634e-05,
"loss": 3.1757,
"step": 5040
},
{
"epoch": 1.840043723811259,
"grad_norm": 0.87109375,
"learning_rate": 3.865403304178814e-05,
"loss": 3.0718,
"step": 5050
},
{
"epoch": 1.843687374749499,
"grad_norm": 1.0546875,
"learning_rate": 3.853255587949466e-05,
"loss": 3.0605,
"step": 5060
},
{
"epoch": 1.847331025687739,
"grad_norm": 0.96484375,
"learning_rate": 3.841107871720116e-05,
"loss": 3.1806,
"step": 5070
},
{
"epoch": 1.8509746766259791,
"grad_norm": 1.0234375,
"learning_rate": 3.828960155490768e-05,
"loss": 3.1507,
"step": 5080
},
{
"epoch": 1.8546183275642192,
"grad_norm": 0.78515625,
"learning_rate": 3.816812439261419e-05,
"loss": 3.1032,
"step": 5090
},
{
"epoch": 1.8582619785024594,
"grad_norm": 0.859375,
"learning_rate": 3.8046647230320704e-05,
"loss": 3.1767,
"step": 5100
},
{
"epoch": 1.8619056294406997,
"grad_norm": 0.9609375,
"learning_rate": 3.7925170068027214e-05,
"loss": 3.1871,
"step": 5110
},
{
"epoch": 1.8655492803789397,
"grad_norm": 0.83984375,
"learning_rate": 3.780369290573372e-05,
"loss": 3.2559,
"step": 5120
},
{
"epoch": 1.8691929313171798,
"grad_norm": 0.875,
"learning_rate": 3.768221574344023e-05,
"loss": 3.1719,
"step": 5130
},
{
"epoch": 1.8728365822554198,
"grad_norm": 0.83203125,
"learning_rate": 3.756073858114675e-05,
"loss": 3.1794,
"step": 5140
},
{
"epoch": 1.87648023319366,
"grad_norm": 0.7265625,
"learning_rate": 3.743926141885326e-05,
"loss": 3.1099,
"step": 5150
},
{
"epoch": 1.8801238841319001,
"grad_norm": 0.81640625,
"learning_rate": 3.731778425655977e-05,
"loss": 3.1963,
"step": 5160
},
{
"epoch": 1.8837675350701404,
"grad_norm": 0.8046875,
"learning_rate": 3.7196307094266284e-05,
"loss": 3.1068,
"step": 5170
},
{
"epoch": 1.8874111860083804,
"grad_norm": 0.7734375,
"learning_rate": 3.707482993197279e-05,
"loss": 3.1131,
"step": 5180
},
{
"epoch": 1.8910548369466205,
"grad_norm": 0.90625,
"learning_rate": 3.69533527696793e-05,
"loss": 3.2152,
"step": 5190
},
{
"epoch": 1.8946984878848605,
"grad_norm": 0.99609375,
"learning_rate": 3.683187560738581e-05,
"loss": 3.2625,
"step": 5200
},
{
"epoch": 1.8983421388231008,
"grad_norm": 0.95703125,
"learning_rate": 3.671039844509233e-05,
"loss": 3.1989,
"step": 5210
},
{
"epoch": 1.901985789761341,
"grad_norm": 0.84765625,
"learning_rate": 3.658892128279884e-05,
"loss": 3.2053,
"step": 5220
},
{
"epoch": 1.905629440699581,
"grad_norm": 0.8515625,
"learning_rate": 3.646744412050535e-05,
"loss": 3.2008,
"step": 5230
},
{
"epoch": 1.909273091637821,
"grad_norm": 0.80078125,
"learning_rate": 3.634596695821186e-05,
"loss": 3.1759,
"step": 5240
},
{
"epoch": 1.9129167425760611,
"grad_norm": 1.0234375,
"learning_rate": 3.622448979591837e-05,
"loss": 3.1949,
"step": 5250
},
{
"epoch": 1.9165603935143012,
"grad_norm": 0.93359375,
"learning_rate": 3.6103012633624876e-05,
"loss": 3.1669,
"step": 5260
},
{
"epoch": 1.9202040444525414,
"grad_norm": 0.9296875,
"learning_rate": 3.598153547133139e-05,
"loss": 3.1745,
"step": 5270
},
{
"epoch": 1.9238476953907817,
"grad_norm": 0.81640625,
"learning_rate": 3.58600583090379e-05,
"loss": 3.1438,
"step": 5280
},
{
"epoch": 1.9274913463290217,
"grad_norm": 0.7421875,
"learning_rate": 3.573858114674442e-05,
"loss": 3.1506,
"step": 5290
},
{
"epoch": 1.9311349972672618,
"grad_norm": 0.82421875,
"learning_rate": 3.561710398445092e-05,
"loss": 3.1507,
"step": 5300
},
{
"epoch": 1.9347786482055018,
"grad_norm": 1.140625,
"learning_rate": 3.549562682215744e-05,
"loss": 3.1188,
"step": 5310
},
{
"epoch": 1.9384222991437419,
"grad_norm": 0.88671875,
"learning_rate": 3.5374149659863946e-05,
"loss": 3.1295,
"step": 5320
},
{
"epoch": 1.9420659500819821,
"grad_norm": 0.93359375,
"learning_rate": 3.525267249757046e-05,
"loss": 3.2526,
"step": 5330
},
{
"epoch": 1.9457096010202224,
"grad_norm": 0.859375,
"learning_rate": 3.5131195335276965e-05,
"loss": 3.1166,
"step": 5340
},
{
"epoch": 1.9493532519584624,
"grad_norm": 0.86328125,
"learning_rate": 3.500971817298348e-05,
"loss": 3.1793,
"step": 5350
},
{
"epoch": 1.9529969028967025,
"grad_norm": 0.95703125,
"learning_rate": 3.488824101068999e-05,
"loss": 3.078,
"step": 5360
},
{
"epoch": 1.9566405538349425,
"grad_norm": 0.89453125,
"learning_rate": 3.476676384839651e-05,
"loss": 3.1149,
"step": 5370
},
{
"epoch": 1.9602842047731828,
"grad_norm": 0.8125,
"learning_rate": 3.464528668610301e-05,
"loss": 3.1697,
"step": 5380
},
{
"epoch": 1.9639278557114228,
"grad_norm": 0.7578125,
"learning_rate": 3.4523809523809526e-05,
"loss": 3.1781,
"step": 5390
},
{
"epoch": 1.967571506649663,
"grad_norm": 0.83984375,
"learning_rate": 3.4402332361516035e-05,
"loss": 3.11,
"step": 5400
},
{
"epoch": 1.971215157587903,
"grad_norm": 0.8203125,
"learning_rate": 3.428085519922255e-05,
"loss": 3.1757,
"step": 5410
},
{
"epoch": 1.9748588085261432,
"grad_norm": 0.89453125,
"learning_rate": 3.4159378036929054e-05,
"loss": 3.2046,
"step": 5420
},
{
"epoch": 1.9785024594643832,
"grad_norm": 0.96484375,
"learning_rate": 3.403790087463557e-05,
"loss": 3.2087,
"step": 5430
},
{
"epoch": 1.9821461104026235,
"grad_norm": 0.796875,
"learning_rate": 3.391642371234208e-05,
"loss": 3.1347,
"step": 5440
},
{
"epoch": 1.9857897613408635,
"grad_norm": 0.78125,
"learning_rate": 3.3794946550048596e-05,
"loss": 3.14,
"step": 5450
},
{
"epoch": 1.9894334122791038,
"grad_norm": 0.81640625,
"learning_rate": 3.36734693877551e-05,
"loss": 3.1691,
"step": 5460
},
{
"epoch": 1.9930770632173438,
"grad_norm": 0.86328125,
"learning_rate": 3.3551992225461615e-05,
"loss": 3.1885,
"step": 5470
},
{
"epoch": 1.9967207141555838,
"grad_norm": 0.7734375,
"learning_rate": 3.3430515063168125e-05,
"loss": 3.2177,
"step": 5480
},
{
"epoch": 2.000364365093824,
"grad_norm": 0.8515625,
"learning_rate": 3.3309037900874634e-05,
"loss": 3.2009,
"step": 5490
},
{
"epoch": 2.004008016032064,
"grad_norm": 0.890625,
"learning_rate": 3.318756073858115e-05,
"loss": 3.0367,
"step": 5500
},
{
"epoch": 2.0076516669703044,
"grad_norm": 0.875,
"learning_rate": 3.306608357628766e-05,
"loss": 3.0228,
"step": 5510
},
{
"epoch": 2.0112953179085444,
"grad_norm": 0.796875,
"learning_rate": 3.294460641399417e-05,
"loss": 3.0782,
"step": 5520
},
{
"epoch": 2.0149389688467845,
"grad_norm": 0.93359375,
"learning_rate": 3.282312925170068e-05,
"loss": 3.1693,
"step": 5530
},
{
"epoch": 2.0185826197850245,
"grad_norm": 0.81640625,
"learning_rate": 3.2701652089407195e-05,
"loss": 3.0759,
"step": 5540
},
{
"epoch": 2.0222262707232646,
"grad_norm": 0.9140625,
"learning_rate": 3.2580174927113704e-05,
"loss": 3.135,
"step": 5550
},
{
"epoch": 2.025869921661505,
"grad_norm": 0.9375,
"learning_rate": 3.245869776482022e-05,
"loss": 3.1345,
"step": 5560
},
{
"epoch": 2.029513572599745,
"grad_norm": 0.85546875,
"learning_rate": 3.233722060252672e-05,
"loss": 3.0659,
"step": 5570
},
{
"epoch": 2.033157223537985,
"grad_norm": 0.8046875,
"learning_rate": 3.221574344023324e-05,
"loss": 3.1813,
"step": 5580
},
{
"epoch": 2.036800874476225,
"grad_norm": 0.83984375,
"learning_rate": 3.209426627793975e-05,
"loss": 3.0976,
"step": 5590
},
{
"epoch": 2.040444525414465,
"grad_norm": 0.8203125,
"learning_rate": 3.1972789115646265e-05,
"loss": 3.1173,
"step": 5600
},
{
"epoch": 2.0440881763527052,
"grad_norm": 0.8203125,
"learning_rate": 3.185131195335277e-05,
"loss": 3.1076,
"step": 5610
},
{
"epoch": 2.0477318272909457,
"grad_norm": 0.859375,
"learning_rate": 3.1729834791059284e-05,
"loss": 3.112,
"step": 5620
},
{
"epoch": 2.0513754782291858,
"grad_norm": 0.95703125,
"learning_rate": 3.1608357628765794e-05,
"loss": 3.0959,
"step": 5630
},
{
"epoch": 2.055019129167426,
"grad_norm": 1.0078125,
"learning_rate": 3.148688046647231e-05,
"loss": 3.0968,
"step": 5640
},
{
"epoch": 2.058662780105666,
"grad_norm": 0.8671875,
"learning_rate": 3.136540330417881e-05,
"loss": 3.0484,
"step": 5650
},
{
"epoch": 2.062306431043906,
"grad_norm": 0.7734375,
"learning_rate": 3.124392614188533e-05,
"loss": 3.1581,
"step": 5660
},
{
"epoch": 2.065950081982146,
"grad_norm": 0.9375,
"learning_rate": 3.112244897959184e-05,
"loss": 3.0967,
"step": 5670
},
{
"epoch": 2.0695937329203864,
"grad_norm": 0.88671875,
"learning_rate": 3.1000971817298355e-05,
"loss": 3.0299,
"step": 5680
},
{
"epoch": 2.0732373838586264,
"grad_norm": 0.82421875,
"learning_rate": 3.087949465500486e-05,
"loss": 3.1771,
"step": 5690
},
{
"epoch": 2.0768810347968665,
"grad_norm": 0.98828125,
"learning_rate": 3.0758017492711373e-05,
"loss": 3.1248,
"step": 5700
},
{
"epoch": 2.0805246857351065,
"grad_norm": 0.9609375,
"learning_rate": 3.063654033041788e-05,
"loss": 3.1227,
"step": 5710
},
{
"epoch": 2.0841683366733466,
"grad_norm": 0.83203125,
"learning_rate": 3.0515063168124392e-05,
"loss": 3.1019,
"step": 5720
},
{
"epoch": 2.0878119876115866,
"grad_norm": 1.015625,
"learning_rate": 3.0393586005830905e-05,
"loss": 3.151,
"step": 5730
},
{
"epoch": 2.091455638549827,
"grad_norm": 0.7890625,
"learning_rate": 3.0272108843537418e-05,
"loss": 3.0596,
"step": 5740
},
{
"epoch": 2.095099289488067,
"grad_norm": 0.84765625,
"learning_rate": 3.015063168124393e-05,
"loss": 3.1311,
"step": 5750
},
{
"epoch": 2.098742940426307,
"grad_norm": 0.8828125,
"learning_rate": 3.0029154518950437e-05,
"loss": 3.1926,
"step": 5760
},
{
"epoch": 2.102386591364547,
"grad_norm": 1.03125,
"learning_rate": 2.990767735665695e-05,
"loss": 3.1265,
"step": 5770
},
{
"epoch": 2.1060302423027872,
"grad_norm": 0.96875,
"learning_rate": 2.9786200194363463e-05,
"loss": 3.1637,
"step": 5780
},
{
"epoch": 2.1096738932410277,
"grad_norm": 0.9296875,
"learning_rate": 2.9664723032069976e-05,
"loss": 3.135,
"step": 5790
},
{
"epoch": 2.1133175441792678,
"grad_norm": 0.90234375,
"learning_rate": 2.954324586977648e-05,
"loss": 2.9778,
"step": 5800
},
{
"epoch": 2.116961195117508,
"grad_norm": 0.87890625,
"learning_rate": 2.9421768707482994e-05,
"loss": 3.1818,
"step": 5810
},
{
"epoch": 2.120604846055748,
"grad_norm": 0.8125,
"learning_rate": 2.9300291545189507e-05,
"loss": 3.0939,
"step": 5820
},
{
"epoch": 2.124248496993988,
"grad_norm": 0.94140625,
"learning_rate": 2.917881438289602e-05,
"loss": 3.1183,
"step": 5830
},
{
"epoch": 2.127892147932228,
"grad_norm": 0.8515625,
"learning_rate": 2.9057337220602526e-05,
"loss": 3.1304,
"step": 5840
},
{
"epoch": 2.1315357988704684,
"grad_norm": 0.8671875,
"learning_rate": 2.893586005830904e-05,
"loss": 3.0758,
"step": 5850
},
{
"epoch": 2.1351794498087084,
"grad_norm": 0.91015625,
"learning_rate": 2.8814382896015552e-05,
"loss": 3.0331,
"step": 5860
},
{
"epoch": 2.1388231007469485,
"grad_norm": 1.0234375,
"learning_rate": 2.8692905733722065e-05,
"loss": 3.1495,
"step": 5870
},
{
"epoch": 2.1424667516851885,
"grad_norm": 0.77734375,
"learning_rate": 2.857142857142857e-05,
"loss": 3.0309,
"step": 5880
},
{
"epoch": 2.1461104026234286,
"grad_norm": 0.8984375,
"learning_rate": 2.8449951409135084e-05,
"loss": 3.1425,
"step": 5890
},
{
"epoch": 2.1497540535616686,
"grad_norm": 0.890625,
"learning_rate": 2.8328474246841597e-05,
"loss": 3.0591,
"step": 5900
},
{
"epoch": 2.153397704499909,
"grad_norm": 1.1796875,
"learning_rate": 2.820699708454811e-05,
"loss": 3.1644,
"step": 5910
},
{
"epoch": 2.157041355438149,
"grad_norm": 0.92578125,
"learning_rate": 2.8085519922254615e-05,
"loss": 3.0664,
"step": 5920
},
{
"epoch": 2.160685006376389,
"grad_norm": 0.94921875,
"learning_rate": 2.796404275996113e-05,
"loss": 3.1222,
"step": 5930
},
{
"epoch": 2.164328657314629,
"grad_norm": 0.875,
"learning_rate": 2.784256559766764e-05,
"loss": 3.1721,
"step": 5940
},
{
"epoch": 2.1679723082528692,
"grad_norm": 1.046875,
"learning_rate": 2.7721088435374147e-05,
"loss": 3.2084,
"step": 5950
},
{
"epoch": 2.1716159591911097,
"grad_norm": 0.82421875,
"learning_rate": 2.759961127308066e-05,
"loss": 3.1617,
"step": 5960
},
{
"epoch": 2.1752596101293498,
"grad_norm": 0.8515625,
"learning_rate": 2.7478134110787173e-05,
"loss": 3.1179,
"step": 5970
},
{
"epoch": 2.17890326106759,
"grad_norm": 0.9765625,
"learning_rate": 2.7356656948493686e-05,
"loss": 3.09,
"step": 5980
},
{
"epoch": 2.18254691200583,
"grad_norm": 0.9296875,
"learning_rate": 2.7235179786200192e-05,
"loss": 3.0906,
"step": 5990
},
{
"epoch": 2.18619056294407,
"grad_norm": 0.83203125,
"learning_rate": 2.7113702623906705e-05,
"loss": 3.118,
"step": 6000
},
{
"epoch": 2.18983421388231,
"grad_norm": 0.953125,
"learning_rate": 2.6992225461613218e-05,
"loss": 3.0609,
"step": 6010
},
{
"epoch": 2.19347786482055,
"grad_norm": 0.9609375,
"learning_rate": 2.687074829931973e-05,
"loss": 3.1233,
"step": 6020
},
{
"epoch": 2.1971215157587904,
"grad_norm": 0.85546875,
"learning_rate": 2.674927113702624e-05,
"loss": 3.1228,
"step": 6030
},
{
"epoch": 2.2007651666970305,
"grad_norm": 0.828125,
"learning_rate": 2.662779397473275e-05,
"loss": 3.0195,
"step": 6040
},
{
"epoch": 2.2044088176352705,
"grad_norm": 0.921875,
"learning_rate": 2.6506316812439262e-05,
"loss": 3.1056,
"step": 6050
},
{
"epoch": 2.2080524685735106,
"grad_norm": 0.921875,
"learning_rate": 2.6384839650145775e-05,
"loss": 3.0823,
"step": 6060
},
{
"epoch": 2.2116961195117506,
"grad_norm": 0.97265625,
"learning_rate": 2.6263362487852285e-05,
"loss": 3.0095,
"step": 6070
},
{
"epoch": 2.215339770449991,
"grad_norm": 0.98828125,
"learning_rate": 2.6141885325558797e-05,
"loss": 3.1279,
"step": 6080
},
{
"epoch": 2.218983421388231,
"grad_norm": 0.875,
"learning_rate": 2.6020408163265307e-05,
"loss": 3.0758,
"step": 6090
},
{
"epoch": 2.222627072326471,
"grad_norm": 0.98828125,
"learning_rate": 2.589893100097182e-05,
"loss": 3.0668,
"step": 6100
},
{
"epoch": 2.226270723264711,
"grad_norm": 1.1171875,
"learning_rate": 2.577745383867833e-05,
"loss": 3.0601,
"step": 6110
},
{
"epoch": 2.2299143742029512,
"grad_norm": 0.94140625,
"learning_rate": 2.5655976676384842e-05,
"loss": 3.0568,
"step": 6120
},
{
"epoch": 2.2335580251411913,
"grad_norm": 0.90234375,
"learning_rate": 2.5534499514091355e-05,
"loss": 3.1399,
"step": 6130
},
{
"epoch": 2.2372016760794318,
"grad_norm": 0.9375,
"learning_rate": 2.541302235179786e-05,
"loss": 3.1351,
"step": 6140
},
{
"epoch": 2.240845327017672,
"grad_norm": 1.078125,
"learning_rate": 2.5291545189504374e-05,
"loss": 3.0368,
"step": 6150
},
{
"epoch": 2.244488977955912,
"grad_norm": 1.046875,
"learning_rate": 2.5170068027210887e-05,
"loss": 3.1425,
"step": 6160
},
{
"epoch": 2.248132628894152,
"grad_norm": 0.8984375,
"learning_rate": 2.50485908649174e-05,
"loss": 3.1455,
"step": 6170
},
{
"epoch": 2.251776279832392,
"grad_norm": 0.9453125,
"learning_rate": 2.492711370262391e-05,
"loss": 3.1195,
"step": 6180
},
{
"epoch": 2.255419930770632,
"grad_norm": 0.953125,
"learning_rate": 2.480563654033042e-05,
"loss": 3.1929,
"step": 6190
},
{
"epoch": 2.2590635817088724,
"grad_norm": 0.87109375,
"learning_rate": 2.468415937803693e-05,
"loss": 3.0456,
"step": 6200
},
{
"epoch": 2.2627072326471125,
"grad_norm": 1.1328125,
"learning_rate": 2.456268221574344e-05,
"loss": 3.1606,
"step": 6210
},
{
"epoch": 2.2663508835853525,
"grad_norm": 0.83984375,
"learning_rate": 2.4441205053449954e-05,
"loss": 3.054,
"step": 6220
},
{
"epoch": 2.2699945345235926,
"grad_norm": 1.0,
"learning_rate": 2.4319727891156463e-05,
"loss": 3.17,
"step": 6230
},
{
"epoch": 2.2736381854618326,
"grad_norm": 1.09375,
"learning_rate": 2.4198250728862976e-05,
"loss": 3.0642,
"step": 6240
},
{
"epoch": 2.277281836400073,
"grad_norm": 0.7734375,
"learning_rate": 2.4076773566569485e-05,
"loss": 2.9784,
"step": 6250
},
{
"epoch": 2.280925487338313,
"grad_norm": 0.96875,
"learning_rate": 2.3955296404275998e-05,
"loss": 3.0481,
"step": 6260
},
{
"epoch": 2.284569138276553,
"grad_norm": 0.91796875,
"learning_rate": 2.3833819241982508e-05,
"loss": 3.1128,
"step": 6270
},
{
"epoch": 2.288212789214793,
"grad_norm": 0.9296875,
"learning_rate": 2.371234207968902e-05,
"loss": 3.0554,
"step": 6280
},
{
"epoch": 2.2918564401530332,
"grad_norm": 1.1953125,
"learning_rate": 2.359086491739553e-05,
"loss": 3.1442,
"step": 6290
},
{
"epoch": 2.2955000910912733,
"grad_norm": 0.86328125,
"learning_rate": 2.3469387755102043e-05,
"loss": 3.1732,
"step": 6300
},
{
"epoch": 2.2991437420295133,
"grad_norm": 0.90625,
"learning_rate": 2.3347910592808552e-05,
"loss": 3.1065,
"step": 6310
},
{
"epoch": 2.302787392967754,
"grad_norm": 0.90234375,
"learning_rate": 2.3226433430515065e-05,
"loss": 3.1013,
"step": 6320
},
{
"epoch": 2.306431043905994,
"grad_norm": 0.859375,
"learning_rate": 2.3104956268221575e-05,
"loss": 3.1159,
"step": 6330
},
{
"epoch": 2.310074694844234,
"grad_norm": 0.953125,
"learning_rate": 2.2983479105928087e-05,
"loss": 3.0996,
"step": 6340
},
{
"epoch": 2.313718345782474,
"grad_norm": 0.85546875,
"learning_rate": 2.2862001943634597e-05,
"loss": 3.1101,
"step": 6350
},
{
"epoch": 2.317361996720714,
"grad_norm": 1.0546875,
"learning_rate": 2.2740524781341106e-05,
"loss": 3.1715,
"step": 6360
},
{
"epoch": 2.3210056476589545,
"grad_norm": 0.890625,
"learning_rate": 2.261904761904762e-05,
"loss": 3.0314,
"step": 6370
},
{
"epoch": 2.3246492985971945,
"grad_norm": 0.9921875,
"learning_rate": 2.249757045675413e-05,
"loss": 3.1542,
"step": 6380
},
{
"epoch": 2.3282929495354345,
"grad_norm": 0.8984375,
"learning_rate": 2.237609329446064e-05,
"loss": 3.1521,
"step": 6390
},
{
"epoch": 2.3319366004736746,
"grad_norm": 0.90625,
"learning_rate": 2.225461613216715e-05,
"loss": 3.1132,
"step": 6400
},
{
"epoch": 2.3355802514119146,
"grad_norm": 0.94921875,
"learning_rate": 2.2133138969873664e-05,
"loss": 3.0016,
"step": 6410
},
{
"epoch": 2.339223902350155,
"grad_norm": 0.9921875,
"learning_rate": 2.2011661807580177e-05,
"loss": 3.1012,
"step": 6420
},
{
"epoch": 2.342867553288395,
"grad_norm": 0.9375,
"learning_rate": 2.1890184645286686e-05,
"loss": 3.0911,
"step": 6430
},
{
"epoch": 2.346511204226635,
"grad_norm": 0.8984375,
"learning_rate": 2.17687074829932e-05,
"loss": 3.0734,
"step": 6440
},
{
"epoch": 2.350154855164875,
"grad_norm": 1.03125,
"learning_rate": 2.1647230320699712e-05,
"loss": 3.1034,
"step": 6450
},
{
"epoch": 2.3537985061031153,
"grad_norm": 0.90234375,
"learning_rate": 2.152575315840622e-05,
"loss": 3.1082,
"step": 6460
},
{
"epoch": 2.3574421570413553,
"grad_norm": 0.76953125,
"learning_rate": 2.1404275996112734e-05,
"loss": 3.078,
"step": 6470
},
{
"epoch": 2.3610858079795953,
"grad_norm": 0.828125,
"learning_rate": 2.1282798833819244e-05,
"loss": 3.077,
"step": 6480
},
{
"epoch": 2.364729458917836,
"grad_norm": 0.91015625,
"learning_rate": 2.1161321671525756e-05,
"loss": 3.0717,
"step": 6490
},
{
"epoch": 2.368373109856076,
"grad_norm": 0.91015625,
"learning_rate": 2.1039844509232266e-05,
"loss": 3.0983,
"step": 6500
},
{
"epoch": 2.372016760794316,
"grad_norm": 0.890625,
"learning_rate": 2.091836734693878e-05,
"loss": 3.0621,
"step": 6510
},
{
"epoch": 2.375660411732556,
"grad_norm": 1.015625,
"learning_rate": 2.0796890184645288e-05,
"loss": 3.0199,
"step": 6520
},
{
"epoch": 2.379304062670796,
"grad_norm": 0.94921875,
"learning_rate": 2.06754130223518e-05,
"loss": 3.1456,
"step": 6530
},
{
"epoch": 2.3829477136090365,
"grad_norm": 1.125,
"learning_rate": 2.055393586005831e-05,
"loss": 3.0592,
"step": 6540
},
{
"epoch": 2.3865913645472765,
"grad_norm": 0.96875,
"learning_rate": 2.0432458697764823e-05,
"loss": 3.0988,
"step": 6550
},
{
"epoch": 2.3902350154855165,
"grad_norm": 0.875,
"learning_rate": 2.0310981535471333e-05,
"loss": 3.1426,
"step": 6560
},
{
"epoch": 2.3938786664237566,
"grad_norm": 1.03125,
"learning_rate": 2.0189504373177842e-05,
"loss": 3.0916,
"step": 6570
},
{
"epoch": 2.3975223173619966,
"grad_norm": 0.92578125,
"learning_rate": 2.0068027210884355e-05,
"loss": 3.1088,
"step": 6580
},
{
"epoch": 2.4011659683002367,
"grad_norm": 0.91796875,
"learning_rate": 1.9946550048590865e-05,
"loss": 3.0977,
"step": 6590
},
{
"epoch": 2.404809619238477,
"grad_norm": 0.875,
"learning_rate": 1.9825072886297377e-05,
"loss": 3.1589,
"step": 6600
},
{
"epoch": 2.408453270176717,
"grad_norm": 0.9375,
"learning_rate": 1.9703595724003887e-05,
"loss": 3.0655,
"step": 6610
},
{
"epoch": 2.412096921114957,
"grad_norm": 0.95703125,
"learning_rate": 1.95821185617104e-05,
"loss": 3.085,
"step": 6620
},
{
"epoch": 2.4157405720531973,
"grad_norm": 1.0,
"learning_rate": 1.946064139941691e-05,
"loss": 3.0235,
"step": 6630
},
{
"epoch": 2.4193842229914373,
"grad_norm": 1.015625,
"learning_rate": 1.9339164237123422e-05,
"loss": 3.1214,
"step": 6640
},
{
"epoch": 2.4230278739296773,
"grad_norm": 0.8671875,
"learning_rate": 1.921768707482993e-05,
"loss": 3.1466,
"step": 6650
},
{
"epoch": 2.426671524867918,
"grad_norm": 1.1640625,
"learning_rate": 1.9096209912536444e-05,
"loss": 3.0947,
"step": 6660
},
{
"epoch": 2.430315175806158,
"grad_norm": 0.984375,
"learning_rate": 1.8974732750242954e-05,
"loss": 3.1565,
"step": 6670
},
{
"epoch": 2.433958826744398,
"grad_norm": 0.9921875,
"learning_rate": 1.8853255587949467e-05,
"loss": 3.1209,
"step": 6680
},
{
"epoch": 2.437602477682638,
"grad_norm": 0.80078125,
"learning_rate": 1.8731778425655976e-05,
"loss": 3.0784,
"step": 6690
},
{
"epoch": 2.441246128620878,
"grad_norm": 0.828125,
"learning_rate": 1.861030126336249e-05,
"loss": 3.1211,
"step": 6700
},
{
"epoch": 2.4448897795591185,
"grad_norm": 0.828125,
"learning_rate": 1.8488824101069e-05,
"loss": 3.1112,
"step": 6710
},
{
"epoch": 2.4485334304973585,
"grad_norm": 0.9375,
"learning_rate": 1.836734693877551e-05,
"loss": 3.1023,
"step": 6720
},
{
"epoch": 2.4521770814355985,
"grad_norm": 0.98046875,
"learning_rate": 1.824586977648202e-05,
"loss": 3.1156,
"step": 6730
},
{
"epoch": 2.4558207323738386,
"grad_norm": 0.9140625,
"learning_rate": 1.8124392614188534e-05,
"loss": 3.104,
"step": 6740
},
{
"epoch": 2.4594643833120786,
"grad_norm": 0.90234375,
"learning_rate": 1.8002915451895043e-05,
"loss": 3.1426,
"step": 6750
},
{
"epoch": 2.4631080342503187,
"grad_norm": 0.80859375,
"learning_rate": 1.7881438289601556e-05,
"loss": 3.126,
"step": 6760
},
{
"epoch": 2.4667516851885587,
"grad_norm": 0.953125,
"learning_rate": 1.7759961127308065e-05,
"loss": 3.104,
"step": 6770
},
{
"epoch": 2.470395336126799,
"grad_norm": 0.8359375,
"learning_rate": 1.7638483965014578e-05,
"loss": 3.1772,
"step": 6780
},
{
"epoch": 2.474038987065039,
"grad_norm": 1.0078125,
"learning_rate": 1.7517006802721088e-05,
"loss": 3.0617,
"step": 6790
},
{
"epoch": 2.4776826380032793,
"grad_norm": 1.0859375,
"learning_rate": 1.73955296404276e-05,
"loss": 3.0941,
"step": 6800
},
{
"epoch": 2.4813262889415193,
"grad_norm": 1.0625,
"learning_rate": 1.7274052478134113e-05,
"loss": 3.1459,
"step": 6810
},
{
"epoch": 2.4849699398797593,
"grad_norm": 0.9609375,
"learning_rate": 1.7152575315840623e-05,
"loss": 3.1183,
"step": 6820
},
{
"epoch": 2.488613590818,
"grad_norm": 1.25,
"learning_rate": 1.7031098153547136e-05,
"loss": 3.1467,
"step": 6830
},
{
"epoch": 2.49225724175624,
"grad_norm": 1.03125,
"learning_rate": 1.6909620991253645e-05,
"loss": 3.1645,
"step": 6840
},
{
"epoch": 2.49590089269448,
"grad_norm": 1.171875,
"learning_rate": 1.6788143828960158e-05,
"loss": 3.1622,
"step": 6850
},
{
"epoch": 2.49954454363272,
"grad_norm": 0.97265625,
"learning_rate": 1.6666666666666667e-05,
"loss": 3.1339,
"step": 6860
},
{
"epoch": 2.50318819457096,
"grad_norm": 0.93359375,
"learning_rate": 1.654518950437318e-05,
"loss": 3.0641,
"step": 6870
},
{
"epoch": 2.5068318455092005,
"grad_norm": 0.8984375,
"learning_rate": 1.642371234207969e-05,
"loss": 3.1318,
"step": 6880
},
{
"epoch": 2.51047549644744,
"grad_norm": 0.8671875,
"learning_rate": 1.6302235179786203e-05,
"loss": 3.1287,
"step": 6890
},
{
"epoch": 2.5141191473856805,
"grad_norm": 0.8203125,
"learning_rate": 1.6180758017492712e-05,
"loss": 3.0884,
"step": 6900
},
{
"epoch": 2.5177627983239206,
"grad_norm": 0.83203125,
"learning_rate": 1.6059280855199225e-05,
"loss": 3.1332,
"step": 6910
},
{
"epoch": 2.5214064492621606,
"grad_norm": 0.9375,
"learning_rate": 1.5937803692905734e-05,
"loss": 3.129,
"step": 6920
},
{
"epoch": 2.5250501002004007,
"grad_norm": 1.125,
"learning_rate": 1.5816326530612247e-05,
"loss": 3.134,
"step": 6930
},
{
"epoch": 2.5286937511386407,
"grad_norm": 0.93359375,
"learning_rate": 1.5694849368318757e-05,
"loss": 3.0906,
"step": 6940
},
{
"epoch": 2.532337402076881,
"grad_norm": 1.09375,
"learning_rate": 1.557337220602527e-05,
"loss": 3.1158,
"step": 6950
},
{
"epoch": 2.5359810530151212,
"grad_norm": 0.80859375,
"learning_rate": 1.545189504373178e-05,
"loss": 3.0513,
"step": 6960
},
{
"epoch": 2.5396247039533613,
"grad_norm": 0.87890625,
"learning_rate": 1.5330417881438292e-05,
"loss": 3.0564,
"step": 6970
},
{
"epoch": 2.5432683548916013,
"grad_norm": 0.9140625,
"learning_rate": 1.5208940719144801e-05,
"loss": 3.1163,
"step": 6980
},
{
"epoch": 2.5469120058298413,
"grad_norm": 0.83984375,
"learning_rate": 1.5087463556851314e-05,
"loss": 3.0554,
"step": 6990
},
{
"epoch": 2.550555656768082,
"grad_norm": 0.94921875,
"learning_rate": 1.4965986394557824e-05,
"loss": 3.1519,
"step": 7000
},
{
"epoch": 2.554199307706322,
"grad_norm": 0.96484375,
"learning_rate": 1.4844509232264333e-05,
"loss": 3.0809,
"step": 7010
},
{
"epoch": 2.557842958644562,
"grad_norm": 0.90234375,
"learning_rate": 1.4723032069970846e-05,
"loss": 3.0894,
"step": 7020
},
{
"epoch": 2.561486609582802,
"grad_norm": 1.1015625,
"learning_rate": 1.4601554907677355e-05,
"loss": 3.2001,
"step": 7030
},
{
"epoch": 2.565130260521042,
"grad_norm": 1.046875,
"learning_rate": 1.4480077745383868e-05,
"loss": 3.0819,
"step": 7040
},
{
"epoch": 2.5687739114592825,
"grad_norm": 1.2109375,
"learning_rate": 1.435860058309038e-05,
"loss": 3.084,
"step": 7050
},
{
"epoch": 2.572417562397522,
"grad_norm": 0.9296875,
"learning_rate": 1.423712342079689e-05,
"loss": 3.0735,
"step": 7060
},
{
"epoch": 2.5760612133357625,
"grad_norm": 1.0625,
"learning_rate": 1.4115646258503402e-05,
"loss": 3.1273,
"step": 7070
},
{
"epoch": 2.5797048642740026,
"grad_norm": 0.91015625,
"learning_rate": 1.3994169096209913e-05,
"loss": 3.1316,
"step": 7080
},
{
"epoch": 2.5833485152122426,
"grad_norm": 0.96484375,
"learning_rate": 1.3872691933916424e-05,
"loss": 3.1375,
"step": 7090
},
{
"epoch": 2.5869921661504827,
"grad_norm": 0.91015625,
"learning_rate": 1.3751214771622937e-05,
"loss": 3.1154,
"step": 7100
},
{
"epoch": 2.5906358170887227,
"grad_norm": 0.92578125,
"learning_rate": 1.3629737609329446e-05,
"loss": 3.1512,
"step": 7110
},
{
"epoch": 2.594279468026963,
"grad_norm": 0.9140625,
"learning_rate": 1.350826044703596e-05,
"loss": 3.087,
"step": 7120
},
{
"epoch": 2.5979231189652032,
"grad_norm": 1.046875,
"learning_rate": 1.3386783284742469e-05,
"loss": 3.1213,
"step": 7130
},
{
"epoch": 2.6015667699034433,
"grad_norm": 0.92578125,
"learning_rate": 1.3265306122448982e-05,
"loss": 3.1229,
"step": 7140
},
{
"epoch": 2.6052104208416833,
"grad_norm": 1.03125,
"learning_rate": 1.3143828960155491e-05,
"loss": 3.1252,
"step": 7150
},
{
"epoch": 2.6088540717799233,
"grad_norm": 0.890625,
"learning_rate": 1.3022351797862004e-05,
"loss": 3.0816,
"step": 7160
},
{
"epoch": 2.612497722718164,
"grad_norm": 1.109375,
"learning_rate": 1.2900874635568513e-05,
"loss": 3.1299,
"step": 7170
},
{
"epoch": 2.616141373656404,
"grad_norm": 0.92578125,
"learning_rate": 1.2779397473275026e-05,
"loss": 3.0499,
"step": 7180
},
{
"epoch": 2.619785024594644,
"grad_norm": 1.1640625,
"learning_rate": 1.2657920310981536e-05,
"loss": 3.1844,
"step": 7190
},
{
"epoch": 2.623428675532884,
"grad_norm": 0.83203125,
"learning_rate": 1.2536443148688048e-05,
"loss": 3.148,
"step": 7200
},
{
"epoch": 2.627072326471124,
"grad_norm": 0.90625,
"learning_rate": 1.2414965986394558e-05,
"loss": 3.0752,
"step": 7210
},
{
"epoch": 2.630715977409364,
"grad_norm": 0.99609375,
"learning_rate": 1.2293488824101069e-05,
"loss": 3.1968,
"step": 7220
},
{
"epoch": 2.634359628347604,
"grad_norm": 1.125,
"learning_rate": 1.217201166180758e-05,
"loss": 3.2027,
"step": 7230
},
{
"epoch": 2.6380032792858445,
"grad_norm": 0.890625,
"learning_rate": 1.2050534499514091e-05,
"loss": 3.1299,
"step": 7240
},
{
"epoch": 2.6416469302240846,
"grad_norm": 0.92578125,
"learning_rate": 1.1929057337220603e-05,
"loss": 3.1109,
"step": 7250
},
{
"epoch": 2.6452905811623246,
"grad_norm": 0.8515625,
"learning_rate": 1.1807580174927114e-05,
"loss": 3.1591,
"step": 7260
},
{
"epoch": 2.6489342321005647,
"grad_norm": 1.0703125,
"learning_rate": 1.1686103012633627e-05,
"loss": 3.0862,
"step": 7270
},
{
"epoch": 2.6525778830388047,
"grad_norm": 0.87890625,
"learning_rate": 1.1564625850340138e-05,
"loss": 3.1458,
"step": 7280
},
{
"epoch": 2.656221533977045,
"grad_norm": 1.1171875,
"learning_rate": 1.1443148688046649e-05,
"loss": 3.1476,
"step": 7290
},
{
"epoch": 2.6598651849152852,
"grad_norm": 0.99609375,
"learning_rate": 1.132167152575316e-05,
"loss": 3.136,
"step": 7300
},
{
"epoch": 2.6635088358535253,
"grad_norm": 0.87109375,
"learning_rate": 1.1200194363459671e-05,
"loss": 3.0971,
"step": 7310
},
{
"epoch": 2.6671524867917653,
"grad_norm": 0.95703125,
"learning_rate": 1.1078717201166182e-05,
"loss": 3.1187,
"step": 7320
},
{
"epoch": 2.6707961377300053,
"grad_norm": 0.88671875,
"learning_rate": 1.0957240038872693e-05,
"loss": 3.1171,
"step": 7330
},
{
"epoch": 2.674439788668246,
"grad_norm": 0.9453125,
"learning_rate": 1.0835762876579203e-05,
"loss": 3.1523,
"step": 7340
},
{
"epoch": 2.6780834396064854,
"grad_norm": 0.94921875,
"learning_rate": 1.0714285714285714e-05,
"loss": 3.1301,
"step": 7350
},
{
"epoch": 2.681727090544726,
"grad_norm": 1.0390625,
"learning_rate": 1.0592808551992225e-05,
"loss": 3.1293,
"step": 7360
},
{
"epoch": 2.685370741482966,
"grad_norm": 0.96875,
"learning_rate": 1.0471331389698736e-05,
"loss": 3.1171,
"step": 7370
},
{
"epoch": 2.689014392421206,
"grad_norm": 1.0859375,
"learning_rate": 1.0349854227405248e-05,
"loss": 3.0375,
"step": 7380
},
{
"epoch": 2.692658043359446,
"grad_norm": 1.0390625,
"learning_rate": 1.0228377065111759e-05,
"loss": 3.0265,
"step": 7390
},
{
"epoch": 2.696301694297686,
"grad_norm": 0.94921875,
"learning_rate": 1.010689990281827e-05,
"loss": 3.097,
"step": 7400
},
{
"epoch": 2.6999453452359266,
"grad_norm": 0.83984375,
"learning_rate": 9.985422740524781e-06,
"loss": 3.0494,
"step": 7410
},
{
"epoch": 2.7035889961741666,
"grad_norm": 0.9140625,
"learning_rate": 9.863945578231292e-06,
"loss": 3.0811,
"step": 7420
},
{
"epoch": 2.7072326471124066,
"grad_norm": 0.96484375,
"learning_rate": 9.742468415937803e-06,
"loss": 3.1214,
"step": 7430
},
{
"epoch": 2.7108762980506467,
"grad_norm": 0.96875,
"learning_rate": 9.620991253644314e-06,
"loss": 3.1006,
"step": 7440
},
{
"epoch": 2.7145199489888867,
"grad_norm": 0.99609375,
"learning_rate": 9.499514091350827e-06,
"loss": 3.1645,
"step": 7450
},
{
"epoch": 2.718163599927127,
"grad_norm": 0.96484375,
"learning_rate": 9.378036929057338e-06,
"loss": 3.07,
"step": 7460
},
{
"epoch": 2.7218072508653672,
"grad_norm": 0.8515625,
"learning_rate": 9.25655976676385e-06,
"loss": 3.1343,
"step": 7470
},
{
"epoch": 2.7254509018036073,
"grad_norm": 1.015625,
"learning_rate": 9.13508260447036e-06,
"loss": 3.1049,
"step": 7480
},
{
"epoch": 2.7290945527418473,
"grad_norm": 0.90234375,
"learning_rate": 9.013605442176872e-06,
"loss": 3.1182,
"step": 7490
},
{
"epoch": 2.7327382036800874,
"grad_norm": 0.84765625,
"learning_rate": 8.892128279883383e-06,
"loss": 3.063,
"step": 7500
},
{
"epoch": 2.736381854618328,
"grad_norm": 0.9453125,
"learning_rate": 8.770651117589894e-06,
"loss": 3.0678,
"step": 7510
},
{
"epoch": 2.7400255055565674,
"grad_norm": 0.84765625,
"learning_rate": 8.649173955296405e-06,
"loss": 3.1132,
"step": 7520
},
{
"epoch": 2.743669156494808,
"grad_norm": 0.9765625,
"learning_rate": 8.527696793002917e-06,
"loss": 3.0649,
"step": 7530
},
{
"epoch": 2.747312807433048,
"grad_norm": 0.9140625,
"learning_rate": 8.406219630709428e-06,
"loss": 3.0386,
"step": 7540
},
{
"epoch": 2.750956458371288,
"grad_norm": 0.96484375,
"learning_rate": 8.284742468415939e-06,
"loss": 3.0972,
"step": 7550
},
{
"epoch": 2.754600109309528,
"grad_norm": 1.0703125,
"learning_rate": 8.163265306122448e-06,
"loss": 3.1145,
"step": 7560
},
{
"epoch": 2.758243760247768,
"grad_norm": 0.94140625,
"learning_rate": 8.04178814382896e-06,
"loss": 3.1053,
"step": 7570
},
{
"epoch": 2.7618874111860086,
"grad_norm": 0.95703125,
"learning_rate": 7.92031098153547e-06,
"loss": 3.1086,
"step": 7580
},
{
"epoch": 2.7655310621242486,
"grad_norm": 0.875,
"learning_rate": 7.798833819241982e-06,
"loss": 3.0831,
"step": 7590
},
{
"epoch": 2.7691747130624886,
"grad_norm": 1.015625,
"learning_rate": 7.677356656948493e-06,
"loss": 3.1135,
"step": 7600
},
{
"epoch": 2.7728183640007287,
"grad_norm": 0.921875,
"learning_rate": 7.555879494655005e-06,
"loss": 3.0605,
"step": 7610
},
{
"epoch": 2.7764620149389687,
"grad_norm": 0.96484375,
"learning_rate": 7.434402332361516e-06,
"loss": 2.9854,
"step": 7620
},
{
"epoch": 2.780105665877209,
"grad_norm": 1.078125,
"learning_rate": 7.312925170068027e-06,
"loss": 3.15,
"step": 7630
},
{
"epoch": 2.7837493168154492,
"grad_norm": 0.921875,
"learning_rate": 7.191448007774538e-06,
"loss": 3.1166,
"step": 7640
},
{
"epoch": 2.7873929677536893,
"grad_norm": 0.9375,
"learning_rate": 7.06997084548105e-06,
"loss": 3.0783,
"step": 7650
},
{
"epoch": 2.7910366186919293,
"grad_norm": 0.796875,
"learning_rate": 6.948493683187561e-06,
"loss": 3.0845,
"step": 7660
},
{
"epoch": 2.7946802696301694,
"grad_norm": 1.015625,
"learning_rate": 6.827016520894072e-06,
"loss": 3.0717,
"step": 7670
},
{
"epoch": 2.7983239205684094,
"grad_norm": 1.109375,
"learning_rate": 6.705539358600584e-06,
"loss": 3.0456,
"step": 7680
},
{
"epoch": 2.8019675715066494,
"grad_norm": 0.890625,
"learning_rate": 6.584062196307095e-06,
"loss": 3.071,
"step": 7690
},
{
"epoch": 2.80561122244489,
"grad_norm": 1.0390625,
"learning_rate": 6.462585034013606e-06,
"loss": 3.063,
"step": 7700
},
{
"epoch": 2.80925487338313,
"grad_norm": 1.0546875,
"learning_rate": 6.341107871720117e-06,
"loss": 3.1031,
"step": 7710
},
{
"epoch": 2.81289852432137,
"grad_norm": 0.97265625,
"learning_rate": 6.219630709426628e-06,
"loss": 3.0297,
"step": 7720
},
{
"epoch": 2.81654217525961,
"grad_norm": 0.92578125,
"learning_rate": 6.098153547133139e-06,
"loss": 3.119,
"step": 7730
},
{
"epoch": 2.82018582619785,
"grad_norm": 0.92578125,
"learning_rate": 5.97667638483965e-06,
"loss": 3.0535,
"step": 7740
},
{
"epoch": 2.8238294771360906,
"grad_norm": 0.81640625,
"learning_rate": 5.855199222546161e-06,
"loss": 3.1086,
"step": 7750
},
{
"epoch": 2.8274731280743306,
"grad_norm": 0.9609375,
"learning_rate": 5.733722060252672e-06,
"loss": 3.133,
"step": 7760
},
{
"epoch": 2.8311167790125706,
"grad_norm": 1.0625,
"learning_rate": 5.612244897959184e-06,
"loss": 3.1374,
"step": 7770
},
{
"epoch": 2.8347604299508107,
"grad_norm": 0.9453125,
"learning_rate": 5.4907677356656954e-06,
"loss": 3.1706,
"step": 7780
},
{
"epoch": 2.8384040808890507,
"grad_norm": 0.9453125,
"learning_rate": 5.369290573372207e-06,
"loss": 3.0924,
"step": 7790
},
{
"epoch": 2.842047731827291,
"grad_norm": 0.87890625,
"learning_rate": 5.247813411078718e-06,
"loss": 3.0695,
"step": 7800
},
{
"epoch": 2.845691382765531,
"grad_norm": 0.89453125,
"learning_rate": 5.126336248785229e-06,
"loss": 3.0492,
"step": 7810
},
{
"epoch": 2.8493350337037713,
"grad_norm": 0.83984375,
"learning_rate": 5.00485908649174e-06,
"loss": 3.0992,
"step": 7820
},
{
"epoch": 2.8529786846420113,
"grad_norm": 0.875,
"learning_rate": 4.88338192419825e-06,
"loss": 3.0975,
"step": 7830
},
{
"epoch": 2.8566223355802514,
"grad_norm": 0.93359375,
"learning_rate": 4.7619047619047615e-06,
"loss": 3.1571,
"step": 7840
},
{
"epoch": 2.8602659865184914,
"grad_norm": 0.984375,
"learning_rate": 4.640427599611273e-06,
"loss": 3.1478,
"step": 7850
},
{
"epoch": 2.8639096374567314,
"grad_norm": 0.94140625,
"learning_rate": 4.518950437317785e-06,
"loss": 3.117,
"step": 7860
},
{
"epoch": 2.867553288394972,
"grad_norm": 0.94921875,
"learning_rate": 4.397473275024296e-06,
"loss": 3.0144,
"step": 7870
},
{
"epoch": 2.871196939333212,
"grad_norm": 0.8828125,
"learning_rate": 4.275996112730807e-06,
"loss": 3.1565,
"step": 7880
},
{
"epoch": 2.874840590271452,
"grad_norm": 1.015625,
"learning_rate": 4.154518950437318e-06,
"loss": 3.2086,
"step": 7890
},
{
"epoch": 2.878484241209692,
"grad_norm": 0.82421875,
"learning_rate": 4.033041788143829e-06,
"loss": 3.124,
"step": 7900
},
{
"epoch": 2.882127892147932,
"grad_norm": 0.94921875,
"learning_rate": 3.9115646258503405e-06,
"loss": 3.046,
"step": 7910
},
{
"epoch": 2.8857715430861726,
"grad_norm": 0.8828125,
"learning_rate": 3.7900874635568516e-06,
"loss": 3.1214,
"step": 7920
},
{
"epoch": 2.8894151940244126,
"grad_norm": 0.91796875,
"learning_rate": 3.6686103012633628e-06,
"loss": 3.0823,
"step": 7930
},
{
"epoch": 2.8930588449626526,
"grad_norm": 0.765625,
"learning_rate": 3.5471331389698735e-06,
"loss": 3.0588,
"step": 7940
},
{
"epoch": 2.8967024959008927,
"grad_norm": 0.93359375,
"learning_rate": 3.4256559766763847e-06,
"loss": 3.1368,
"step": 7950
},
{
"epoch": 2.9003461468391327,
"grad_norm": 0.87890625,
"learning_rate": 3.304178814382896e-06,
"loss": 3.0578,
"step": 7960
},
{
"epoch": 2.903989797777373,
"grad_norm": 0.890625,
"learning_rate": 3.1827016520894074e-06,
"loss": 3.1724,
"step": 7970
},
{
"epoch": 2.907633448715613,
"grad_norm": 0.85546875,
"learning_rate": 3.0612244897959185e-06,
"loss": 3.1477,
"step": 7980
},
{
"epoch": 2.9112770996538533,
"grad_norm": 0.96484375,
"learning_rate": 2.9397473275024297e-06,
"loss": 3.1196,
"step": 7990
},
{
"epoch": 2.9149207505920933,
"grad_norm": 0.9765625,
"learning_rate": 2.818270165208941e-06,
"loss": 3.1087,
"step": 8000
}
],
"logging_steps": 10,
"max_steps": 8232,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7504106874736026e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}