dumb-dev's picture
at least i generates some kind of output now...
8bb1997 verified
raw
history blame contribute delete
No virus
107 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 20000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032,
"grad_norm": 0.7325745224952698,
"learning_rate": 9.99993683466483e-05,
"loss": 1.3543,
"step": 32
},
{
"epoch": 0.0064,
"grad_norm": 0.8166712522506714,
"learning_rate": 9.999747340255259e-05,
"loss": 1.2915,
"step": 64
},
{
"epoch": 0.0096,
"grad_norm": 1.328125238418579,
"learning_rate": 9.999431521559082e-05,
"loss": 1.2584,
"step": 96
},
{
"epoch": 0.0128,
"grad_norm": 1.0592037439346313,
"learning_rate": 9.998989386555814e-05,
"loss": 1.2065,
"step": 128
},
{
"epoch": 0.016,
"grad_norm": 1.292716145515442,
"learning_rate": 9.9984209464165e-05,
"loss": 1.1616,
"step": 160
},
{
"epoch": 0.0192,
"grad_norm": 1.1350401639938354,
"learning_rate": 9.997726215503422e-05,
"loss": 1.1175,
"step": 192
},
{
"epoch": 0.0224,
"grad_norm": 1.0363103151321411,
"learning_rate": 9.996905211369748e-05,
"loss": 1.1306,
"step": 224
},
{
"epoch": 0.0256,
"grad_norm": 1.3373076915740967,
"learning_rate": 9.995957954759071e-05,
"loss": 1.1809,
"step": 256
},
{
"epoch": 0.0288,
"grad_norm": 1.1969462633132935,
"learning_rate": 9.994884469604912e-05,
"loss": 1.2335,
"step": 288
},
{
"epoch": 0.032,
"grad_norm": 1.1673309803009033,
"learning_rate": 9.993684783030088e-05,
"loss": 1.1687,
"step": 320
},
{
"epoch": 0.0352,
"grad_norm": 1.849138617515564,
"learning_rate": 9.99235892534604e-05,
"loss": 1.1873,
"step": 352
},
{
"epoch": 0.0384,
"grad_norm": 1.3689218759536743,
"learning_rate": 9.990906930052064e-05,
"loss": 1.093,
"step": 384
},
{
"epoch": 0.0416,
"grad_norm": 1.2357085943222046,
"learning_rate": 9.989328833834471e-05,
"loss": 1.1549,
"step": 416
},
{
"epoch": 0.0448,
"grad_norm": 1.5436211824417114,
"learning_rate": 9.987624676565652e-05,
"loss": 1.1943,
"step": 448
},
{
"epoch": 0.048,
"grad_norm": 1.1809171438217163,
"learning_rate": 9.98579450130307e-05,
"loss": 1.1305,
"step": 480
},
{
"epoch": 0.0512,
"grad_norm": 1.1661288738250732,
"learning_rate": 9.983838354288181e-05,
"loss": 1.0564,
"step": 512
},
{
"epoch": 0.0544,
"grad_norm": 2.7672770023345947,
"learning_rate": 9.981756284945256e-05,
"loss": 1.1576,
"step": 544
},
{
"epoch": 0.0576,
"grad_norm": 1.6773067712783813,
"learning_rate": 9.979548345880141e-05,
"loss": 1.0685,
"step": 576
},
{
"epoch": 0.0608,
"grad_norm": 1.1834300756454468,
"learning_rate": 9.977214592878916e-05,
"loss": 1.107,
"step": 608
},
{
"epoch": 0.064,
"grad_norm": 1.0704506635665894,
"learning_rate": 9.974755084906502e-05,
"loss": 1.1127,
"step": 640
},
{
"epoch": 0.0672,
"grad_norm": 2.3311216831207275,
"learning_rate": 9.972169884105153e-05,
"loss": 1.1119,
"step": 672
},
{
"epoch": 0.0704,
"grad_norm": 1.1934360265731812,
"learning_rate": 9.969459055792903e-05,
"loss": 1.1084,
"step": 704
},
{
"epoch": 0.0736,
"grad_norm": 1.8318133354187012,
"learning_rate": 9.9666226684619e-05,
"loss": 1.1249,
"step": 736
},
{
"epoch": 0.0768,
"grad_norm": 1.3005311489105225,
"learning_rate": 9.963660793776688e-05,
"loss": 1.0904,
"step": 768
},
{
"epoch": 0.08,
"grad_norm": 1.3241008520126343,
"learning_rate": 9.96057350657239e-05,
"loss": 1.0616,
"step": 800
},
{
"epoch": 0.0832,
"grad_norm": 2.055724620819092,
"learning_rate": 9.957360884852817e-05,
"loss": 1.1737,
"step": 832
},
{
"epoch": 0.0864,
"grad_norm": 1.22575044631958,
"learning_rate": 9.954023009788504e-05,
"loss": 1.0874,
"step": 864
},
{
"epoch": 0.0896,
"grad_norm": 1.0968865156173706,
"learning_rate": 9.950559965714648e-05,
"loss": 1.0928,
"step": 896
},
{
"epoch": 0.0928,
"grad_norm": 1.220700740814209,
"learning_rate": 9.946971840128981e-05,
"loss": 1.1083,
"step": 928
},
{
"epoch": 0.096,
"grad_norm": 2.3711702823638916,
"learning_rate": 9.94325872368957e-05,
"loss": 1.1401,
"step": 960
},
{
"epoch": 0.0992,
"grad_norm": 1.261365532875061,
"learning_rate": 9.939420710212511e-05,
"loss": 1.159,
"step": 992
},
{
"epoch": 0.1024,
"grad_norm": 1.2131611108779907,
"learning_rate": 9.935457896669568e-05,
"loss": 1.1364,
"step": 1024
},
{
"epoch": 0.1056,
"grad_norm": 1.0336030721664429,
"learning_rate": 9.931370383185718e-05,
"loss": 1.0561,
"step": 1056
},
{
"epoch": 0.1088,
"grad_norm": 2.2293291091918945,
"learning_rate": 9.927158273036625e-05,
"loss": 1.121,
"step": 1088
},
{
"epoch": 0.112,
"grad_norm": 0.9839213490486145,
"learning_rate": 9.922821672646027e-05,
"loss": 1.1557,
"step": 1120
},
{
"epoch": 0.1152,
"grad_norm": 1.3354933261871338,
"learning_rate": 9.918360691583056e-05,
"loss": 1.0198,
"step": 1152
},
{
"epoch": 0.1184,
"grad_norm": 1.2504842281341553,
"learning_rate": 9.913775442559452e-05,
"loss": 1.0997,
"step": 1184
},
{
"epoch": 0.1216,
"grad_norm": 1.236770749092102,
"learning_rate": 9.909066041426733e-05,
"loss": 1.1579,
"step": 1216
},
{
"epoch": 0.1248,
"grad_norm": 1.1531703472137451,
"learning_rate": 9.904232607173262e-05,
"loss": 1.1022,
"step": 1248
},
{
"epoch": 0.128,
"grad_norm": 1.2251778841018677,
"learning_rate": 9.899275261921234e-05,
"loss": 1.1239,
"step": 1280
},
{
"epoch": 0.1312,
"grad_norm": 1.0417462587356567,
"learning_rate": 9.894194130923602e-05,
"loss": 1.1896,
"step": 1312
},
{
"epoch": 0.1344,
"grad_norm": 1.3808753490447998,
"learning_rate": 9.888989342560899e-05,
"loss": 1.096,
"step": 1344
},
{
"epoch": 0.1376,
"grad_norm": 1.349967360496521,
"learning_rate": 9.883661028338008e-05,
"loss": 1.077,
"step": 1376
},
{
"epoch": 0.1408,
"grad_norm": 2.028076648712158,
"learning_rate": 9.87820932288083e-05,
"loss": 1.0932,
"step": 1408
},
{
"epoch": 0.144,
"grad_norm": 1.088742971420288,
"learning_rate": 9.872634363932887e-05,
"loss": 1.1665,
"step": 1440
},
{
"epoch": 0.1472,
"grad_norm": 1.1702725887298584,
"learning_rate": 9.866936292351836e-05,
"loss": 1.058,
"step": 1472
},
{
"epoch": 0.1504,
"grad_norm": 1.2243529558181763,
"learning_rate": 9.861115252105921e-05,
"loss": 1.131,
"step": 1504
},
{
"epoch": 0.1536,
"grad_norm": 1.4041122198104858,
"learning_rate": 9.855171390270324e-05,
"loss": 1.0844,
"step": 1536
},
{
"epoch": 0.1568,
"grad_norm": 1.4032260179519653,
"learning_rate": 9.849104857023455e-05,
"loss": 1.1046,
"step": 1568
},
{
"epoch": 0.16,
"grad_norm": 2.4822256565093994,
"learning_rate": 9.842915805643155e-05,
"loss": 1.0779,
"step": 1600
},
{
"epoch": 0.1632,
"grad_norm": 1.7823238372802734,
"learning_rate": 9.83660439250283e-05,
"loss": 1.0,
"step": 1632
},
{
"epoch": 0.1664,
"grad_norm": 1.3723595142364502,
"learning_rate": 9.830170777067485e-05,
"loss": 1.0838,
"step": 1664
},
{
"epoch": 0.1696,
"grad_norm": 1.4893419742584229,
"learning_rate": 9.823615121889716e-05,
"loss": 1.0734,
"step": 1696
},
{
"epoch": 0.1728,
"grad_norm": 1.2856264114379883,
"learning_rate": 9.816937592605579e-05,
"loss": 1.0497,
"step": 1728
},
{
"epoch": 0.176,
"grad_norm": 1.2529082298278809,
"learning_rate": 9.81013835793043e-05,
"loss": 1.0624,
"step": 1760
},
{
"epoch": 0.1792,
"grad_norm": 1.107729196548462,
"learning_rate": 9.80321758965464e-05,
"loss": 1.0491,
"step": 1792
},
{
"epoch": 0.1824,
"grad_norm": 0.9313052296638489,
"learning_rate": 9.796175462639272e-05,
"loss": 1.1561,
"step": 1824
},
{
"epoch": 0.1856,
"grad_norm": 1.3460460901260376,
"learning_rate": 9.789012154811647e-05,
"loss": 1.0803,
"step": 1856
},
{
"epoch": 0.1888,
"grad_norm": 1.5794706344604492,
"learning_rate": 9.781727847160865e-05,
"loss": 1.0698,
"step": 1888
},
{
"epoch": 0.192,
"grad_norm": 1.2449215650558472,
"learning_rate": 9.774322723733216e-05,
"loss": 1.103,
"step": 1920
},
{
"epoch": 0.1952,
"grad_norm": 1.1799278259277344,
"learning_rate": 9.766796971627543e-05,
"loss": 1.0284,
"step": 1952
},
{
"epoch": 0.1984,
"grad_norm": 1.1231826543807983,
"learning_rate": 9.759150780990507e-05,
"loss": 1.0863,
"step": 1984
},
{
"epoch": 0.2016,
"grad_norm": 0.9852601289749146,
"learning_rate": 9.751384345011787e-05,
"loss": 1.0038,
"step": 2016
},
{
"epoch": 0.2048,
"grad_norm": 1.560398817062378,
"learning_rate": 9.743497859919196e-05,
"loss": 1.0669,
"step": 2048
},
{
"epoch": 0.208,
"grad_norm": 1.0659574270248413,
"learning_rate": 9.735491524973722e-05,
"loss": 1.1653,
"step": 2080
},
{
"epoch": 0.2112,
"grad_norm": 1.3178914785385132,
"learning_rate": 9.727365542464497e-05,
"loss": 1.0349,
"step": 2112
},
{
"epoch": 0.2144,
"grad_norm": 1.0162935256958008,
"learning_rate": 9.719379593129512e-05,
"loss": 1.1365,
"step": 2144
},
{
"epoch": 0.2176,
"grad_norm": 1.0954954624176025,
"learning_rate": 9.711018657323799e-05,
"loss": 1.0986,
"step": 2176
},
{
"epoch": 0.2208,
"grad_norm": 1.1444238424301147,
"learning_rate": 9.702538692289092e-05,
"loss": 1.1172,
"step": 2208
},
{
"epoch": 0.224,
"grad_norm": 1.112743616104126,
"learning_rate": 9.693939912281324e-05,
"loss": 1.0753,
"step": 2240
},
{
"epoch": 0.2272,
"grad_norm": 2.3814074993133545,
"learning_rate": 9.685222534558421e-05,
"loss": 1.0974,
"step": 2272
},
{
"epoch": 0.2304,
"grad_norm": 1.231828212738037,
"learning_rate": 9.676386779374819e-05,
"loss": 1.065,
"step": 2304
},
{
"epoch": 0.2336,
"grad_norm": 1.3037365674972534,
"learning_rate": 9.667432869975897e-05,
"loss": 1.0593,
"step": 2336
},
{
"epoch": 0.2368,
"grad_norm": 0.9929208755493164,
"learning_rate": 9.658361032592323e-05,
"loss": 1.0158,
"step": 2368
},
{
"epoch": 0.24,
"grad_norm": 0.9741840362548828,
"learning_rate": 9.649171496434361e-05,
"loss": 1.1219,
"step": 2400
},
{
"epoch": 0.2432,
"grad_norm": 1.1726874113082886,
"learning_rate": 9.639864493686061e-05,
"loss": 1.1151,
"step": 2432
},
{
"epoch": 0.2464,
"grad_norm": 0.9950255751609802,
"learning_rate": 9.630440259499406e-05,
"loss": 1.0553,
"step": 2464
},
{
"epoch": 0.2496,
"grad_norm": 1.0302757024765015,
"learning_rate": 9.620899031988359e-05,
"loss": 0.9945,
"step": 2496
},
{
"epoch": 0.2528,
"grad_norm": 1.5070362091064453,
"learning_rate": 9.611241052222852e-05,
"loss": 1.0367,
"step": 2528
},
{
"epoch": 0.256,
"grad_norm": 1.3058743476867676,
"learning_rate": 9.601466564222697e-05,
"loss": 1.0692,
"step": 2560
},
{
"epoch": 0.2592,
"grad_norm": 1.237705111503601,
"learning_rate": 9.591575814951419e-05,
"loss": 1.0626,
"step": 2592
},
{
"epoch": 0.2624,
"grad_norm": 2.5512192249298096,
"learning_rate": 9.581569054310016e-05,
"loss": 1.0316,
"step": 2624
},
{
"epoch": 0.2656,
"grad_norm": 1.5230085849761963,
"learning_rate": 9.571446535130641e-05,
"loss": 1.1311,
"step": 2656
},
{
"epoch": 0.2688,
"grad_norm": 0.9144539833068848,
"learning_rate": 9.561208513170223e-05,
"loss": 1.0661,
"step": 2688
},
{
"epoch": 0.272,
"grad_norm": 1.3663253784179688,
"learning_rate": 9.550855247103998e-05,
"loss": 1.0214,
"step": 2720
},
{
"epoch": 0.2752,
"grad_norm": 1.175469994544983,
"learning_rate": 9.540386998518972e-05,
"loss": 1.0807,
"step": 2752
},
{
"epoch": 0.2784,
"grad_norm": 0.9978043437004089,
"learning_rate": 9.529804031907319e-05,
"loss": 0.9998,
"step": 2784
},
{
"epoch": 0.2816,
"grad_norm": 1.9085396528244019,
"learning_rate": 9.519106614659692e-05,
"loss": 1.0589,
"step": 2816
},
{
"epoch": 0.2848,
"grad_norm": 1.1169906854629517,
"learning_rate": 9.50829501705847e-05,
"loss": 1.0892,
"step": 2848
},
{
"epoch": 0.288,
"grad_norm": 1.0185884237289429,
"learning_rate": 9.497369512270926e-05,
"loss": 1.1336,
"step": 2880
},
{
"epoch": 0.2912,
"grad_norm": 1.0060242414474487,
"learning_rate": 9.48633037634233e-05,
"loss": 1.0556,
"step": 2912
},
{
"epoch": 0.2944,
"grad_norm": 1.1675645112991333,
"learning_rate": 9.475177888188969e-05,
"loss": 1.1435,
"step": 2944
},
{
"epoch": 0.2976,
"grad_norm": 1.0665452480316162,
"learning_rate": 9.463912329591105e-05,
"loss": 1.0272,
"step": 2976
},
{
"epoch": 0.3008,
"grad_norm": 0.9090532064437866,
"learning_rate": 9.452533985185852e-05,
"loss": 1.0426,
"step": 3008
},
{
"epoch": 0.304,
"grad_norm": 0.8997248411178589,
"learning_rate": 9.441043142459985e-05,
"loss": 1.0359,
"step": 3040
},
{
"epoch": 0.3072,
"grad_norm": 1.1093074083328247,
"learning_rate": 9.429440091742676e-05,
"loss": 0.9781,
"step": 3072
},
{
"epoch": 0.3104,
"grad_norm": 1.2096401453018188,
"learning_rate": 9.41772512619816e-05,
"loss": 1.0604,
"step": 3104
},
{
"epoch": 0.3136,
"grad_norm": 0.9333838820457458,
"learning_rate": 9.405898541818329e-05,
"loss": 1.0607,
"step": 3136
},
{
"epoch": 0.3168,
"grad_norm": 1.3407750129699707,
"learning_rate": 9.393960637415248e-05,
"loss": 1.0114,
"step": 3168
},
{
"epoch": 0.32,
"grad_norm": 2.159203290939331,
"learning_rate": 9.38191171461361e-05,
"loss": 1.0373,
"step": 3200
},
{
"epoch": 0.3232,
"grad_norm": 1.2726435661315918,
"learning_rate": 9.369752077843114e-05,
"loss": 1.1084,
"step": 3232
},
{
"epoch": 0.3264,
"grad_norm": 1.0973173379898071,
"learning_rate": 9.357482034330775e-05,
"loss": 0.9722,
"step": 3264
},
{
"epoch": 0.3296,
"grad_norm": 1.1249974966049194,
"learning_rate": 9.345101894093154e-05,
"loss": 1.0646,
"step": 3296
},
{
"epoch": 0.3328,
"grad_norm": 1.109535574913025,
"learning_rate": 9.332611969928536e-05,
"loss": 1.0296,
"step": 3328
},
{
"epoch": 0.336,
"grad_norm": 0.9725342392921448,
"learning_rate": 9.32001257740902e-05,
"loss": 1.0389,
"step": 3360
},
{
"epoch": 0.3392,
"grad_norm": 1.2341935634613037,
"learning_rate": 9.307304034872545e-05,
"loss": 1.064,
"step": 3392
},
{
"epoch": 0.3424,
"grad_norm": 0.9506546854972839,
"learning_rate": 9.294486663414851e-05,
"loss": 1.089,
"step": 3424
},
{
"epoch": 0.3456,
"grad_norm": 1.4520303010940552,
"learning_rate": 9.281560786881363e-05,
"loss": 1.0143,
"step": 3456
},
{
"epoch": 0.3488,
"grad_norm": 1.069808006286621,
"learning_rate": 9.268526731859013e-05,
"loss": 1.0328,
"step": 3488
},
{
"epoch": 0.352,
"grad_norm": 0.9005358815193176,
"learning_rate": 9.25538482766798e-05,
"loss": 1.0842,
"step": 3520
},
{
"epoch": 0.3552,
"grad_norm": 0.9558689594268799,
"learning_rate": 9.242135406353378e-05,
"loss": 0.9927,
"step": 3552
},
{
"epoch": 0.3584,
"grad_norm": 1.1002886295318604,
"learning_rate": 9.228778802676863e-05,
"loss": 1.1007,
"step": 3584
},
{
"epoch": 0.3616,
"grad_norm": 1.1814830303192139,
"learning_rate": 9.215315354108174e-05,
"loss": 1.0102,
"step": 3616
},
{
"epoch": 0.3648,
"grad_norm": 1.4370577335357666,
"learning_rate": 9.201745400816606e-05,
"loss": 1.0723,
"step": 3648
},
{
"epoch": 0.368,
"grad_norm": 1.0058218240737915,
"learning_rate": 9.18806928566242e-05,
"loss": 1.1156,
"step": 3680
},
{
"epoch": 0.3712,
"grad_norm": 1.2105575799942017,
"learning_rate": 9.174287354188174e-05,
"loss": 1.0626,
"step": 3712
},
{
"epoch": 0.3744,
"grad_norm": 0.8971224427223206,
"learning_rate": 9.160399954609997e-05,
"loss": 1.1357,
"step": 3744
},
{
"epoch": 0.3776,
"grad_norm": 1.018344521522522,
"learning_rate": 9.146407437808788e-05,
"loss": 1.1171,
"step": 3776
},
{
"epoch": 0.3808,
"grad_norm": 1.098207712173462,
"learning_rate": 9.132310157321354e-05,
"loss": 1.0556,
"step": 3808
},
{
"epoch": 0.384,
"grad_norm": 1.0569736957550049,
"learning_rate": 9.11810846933147e-05,
"loss": 0.9559,
"step": 3840
},
{
"epoch": 0.3872,
"grad_norm": 1.0195281505584717,
"learning_rate": 9.103802732660894e-05,
"loss": 1.0586,
"step": 3872
},
{
"epoch": 0.3904,
"grad_norm": 1.4709314107894897,
"learning_rate": 9.089393308760283e-05,
"loss": 1.0509,
"step": 3904
},
{
"epoch": 0.3936,
"grad_norm": 0.8363422751426697,
"learning_rate": 9.074880561700074e-05,
"loss": 1.0672,
"step": 3936
},
{
"epoch": 0.3968,
"grad_norm": 1.2150477170944214,
"learning_rate": 9.06026485816128e-05,
"loss": 1.0645,
"step": 3968
},
{
"epoch": 0.4,
"grad_norm": 1.0260250568389893,
"learning_rate": 9.045546567426227e-05,
"loss": 1.0307,
"step": 4000
},
{
"epoch": 0.4032,
"grad_norm": 1.3611576557159424,
"learning_rate": 9.03072606136922e-05,
"loss": 1.1087,
"step": 4032
},
{
"epoch": 0.4064,
"grad_norm": 1.0070726871490479,
"learning_rate": 9.015803714447153e-05,
"loss": 1.0799,
"step": 4064
},
{
"epoch": 0.4096,
"grad_norm": 1.0184143781661987,
"learning_rate": 9.000779903690044e-05,
"loss": 1.0447,
"step": 4096
},
{
"epoch": 0.4128,
"grad_norm": 0.8251619935035706,
"learning_rate": 8.985655008691512e-05,
"loss": 1.0781,
"step": 4128
},
{
"epoch": 0.416,
"grad_norm": 1.1904375553131104,
"learning_rate": 8.970429411599177e-05,
"loss": 1.0679,
"step": 4160
},
{
"epoch": 0.4192,
"grad_norm": 1.1670352220535278,
"learning_rate": 8.955103497105021e-05,
"loss": 1.0098,
"step": 4192
},
{
"epoch": 0.4224,
"grad_norm": 1.018236517906189,
"learning_rate": 8.93967765243565e-05,
"loss": 1.0357,
"step": 4224
},
{
"epoch": 0.4256,
"grad_norm": 1.187759518623352,
"learning_rate": 8.924152267342529e-05,
"loss": 1.1212,
"step": 4256
},
{
"epoch": 0.4288,
"grad_norm": 0.9191340208053589,
"learning_rate": 8.908527734092114e-05,
"loss": 0.9963,
"step": 4288
},
{
"epoch": 0.432,
"grad_norm": 1.250663161277771,
"learning_rate": 8.893297291025703e-05,
"loss": 1.1243,
"step": 4320
},
{
"epoch": 0.4352,
"grad_norm": 1.1205859184265137,
"learning_rate": 8.877478715861173e-05,
"loss": 0.9712,
"step": 4352
},
{
"epoch": 0.4384,
"grad_norm": 1.024400234222412,
"learning_rate": 8.86156217179956e-05,
"loss": 1.0184,
"step": 4384
},
{
"epoch": 0.4416,
"grad_norm": 1.0629040002822876,
"learning_rate": 8.845548060990401e-05,
"loss": 1.0391,
"step": 4416
},
{
"epoch": 0.4448,
"grad_norm": 1.0474681854248047,
"learning_rate": 8.829436788048366e-05,
"loss": 1.1721,
"step": 4448
},
{
"epoch": 0.448,
"grad_norm": 1.2960838079452515,
"learning_rate": 8.813228760043037e-05,
"loss": 1.0247,
"step": 4480
},
{
"epoch": 0.4512,
"grad_norm": 1.1051262617111206,
"learning_rate": 8.796924386488624e-05,
"loss": 1.068,
"step": 4512
},
{
"epoch": 0.4544,
"grad_norm": 0.9894328713417053,
"learning_rate": 8.780524079333615e-05,
"loss": 0.9805,
"step": 4544
},
{
"epoch": 0.4576,
"grad_norm": 1.0095499753952026,
"learning_rate": 8.764028252950365e-05,
"loss": 0.9994,
"step": 4576
},
{
"epoch": 0.4608,
"grad_norm": 1.0299321413040161,
"learning_rate": 8.74743732412464e-05,
"loss": 1.0258,
"step": 4608
},
{
"epoch": 0.464,
"grad_norm": 1.11245858669281,
"learning_rate": 8.73075171204507e-05,
"loss": 1.0388,
"step": 4640
},
{
"epoch": 0.4672,
"grad_norm": 1.2084026336669922,
"learning_rate": 8.713971838292569e-05,
"loss": 1.1596,
"step": 4672
},
{
"epoch": 0.4704,
"grad_norm": 1.1535048484802246,
"learning_rate": 8.697098126829675e-05,
"loss": 1.0674,
"step": 4704
},
{
"epoch": 0.4736,
"grad_norm": 1.0976839065551758,
"learning_rate": 8.680131003989842e-05,
"loss": 1.1089,
"step": 4736
},
{
"epoch": 0.4768,
"grad_norm": 1.1108759641647339,
"learning_rate": 8.663070898466674e-05,
"loss": 1.0047,
"step": 4768
},
{
"epoch": 0.48,
"grad_norm": 0.9953986406326294,
"learning_rate": 8.645918241303084e-05,
"loss": 1.0991,
"step": 4800
},
{
"epoch": 0.4832,
"grad_norm": 1.0783305168151855,
"learning_rate": 8.628673465880404e-05,
"loss": 1.0449,
"step": 4832
},
{
"epoch": 0.4864,
"grad_norm": 1.0670068264007568,
"learning_rate": 8.611337007907448e-05,
"loss": 1.0002,
"step": 4864
},
{
"epoch": 0.4896,
"grad_norm": 1.4406965970993042,
"learning_rate": 8.59390930540948e-05,
"loss": 1.0825,
"step": 4896
},
{
"epoch": 0.4928,
"grad_norm": 2.000100612640381,
"learning_rate": 8.576390798717174e-05,
"loss": 1.0658,
"step": 4928
},
{
"epoch": 0.496,
"grad_norm": 1.1239198446273804,
"learning_rate": 8.558781930455464e-05,
"loss": 1.0066,
"step": 4960
},
{
"epoch": 0.4992,
"grad_norm": 0.965144157409668,
"learning_rate": 8.54108314553238e-05,
"loss": 1.0965,
"step": 4992
},
{
"epoch": 0.5024,
"grad_norm": 1.0297799110412598,
"learning_rate": 8.523294891127794e-05,
"loss": 1.0257,
"step": 5024
},
{
"epoch": 0.5056,
"grad_norm": 1.1478264331817627,
"learning_rate": 8.505417616682126e-05,
"loss": 1.0601,
"step": 5056
},
{
"epoch": 0.5088,
"grad_norm": 1.0132007598876953,
"learning_rate": 8.487451773884987e-05,
"loss": 1.0643,
"step": 5088
},
{
"epoch": 0.512,
"grad_norm": 1.5010863542556763,
"learning_rate": 8.469397816663773e-05,
"loss": 1.0577,
"step": 5120
},
{
"epoch": 0.5152,
"grad_norm": 1.0446892976760864,
"learning_rate": 8.451256201172186e-05,
"loss": 1.0305,
"step": 5152
},
{
"epoch": 0.5184,
"grad_norm": 1.0374213457107544,
"learning_rate": 8.433027385778716e-05,
"loss": 1.0254,
"step": 5184
},
{
"epoch": 0.5216,
"grad_norm": 0.958988606929779,
"learning_rate": 8.414711831055056e-05,
"loss": 1.0157,
"step": 5216
},
{
"epoch": 0.5248,
"grad_norm": 1.049494981765747,
"learning_rate": 8.396309999764467e-05,
"loss": 1.0241,
"step": 5248
},
{
"epoch": 0.528,
"grad_norm": 0.9103986620903015,
"learning_rate": 8.377822356850084e-05,
"loss": 1.0658,
"step": 5280
},
{
"epoch": 0.5312,
"grad_norm": 1.6454554796218872,
"learning_rate": 8.359249369423177e-05,
"loss": 1.0543,
"step": 5312
},
{
"epoch": 0.5344,
"grad_norm": 1.1632812023162842,
"learning_rate": 8.34059150675133e-05,
"loss": 1.0576,
"step": 5344
},
{
"epoch": 0.5376,
"grad_norm": 1.066264033317566,
"learning_rate": 8.321849240246608e-05,
"loss": 1.0488,
"step": 5376
},
{
"epoch": 0.5408,
"grad_norm": 0.9884083867073059,
"learning_rate": 8.303023043453624e-05,
"loss": 1.054,
"step": 5408
},
{
"epoch": 0.544,
"grad_norm": 1.1581878662109375,
"learning_rate": 8.284113392037593e-05,
"loss": 1.0847,
"step": 5440
},
{
"epoch": 0.5472,
"grad_norm": 1.0645771026611328,
"learning_rate": 8.265120763772303e-05,
"loss": 0.9862,
"step": 5472
},
{
"epoch": 0.5504,
"grad_norm": 1.2600454092025757,
"learning_rate": 8.246045638528047e-05,
"loss": 1.0295,
"step": 5504
},
{
"epoch": 0.5536,
"grad_norm": 1.2756901979446411,
"learning_rate": 8.226888498259496e-05,
"loss": 0.9753,
"step": 5536
},
{
"epoch": 0.5568,
"grad_norm": 1.0469154119491577,
"learning_rate": 8.207649826993522e-05,
"loss": 1.0993,
"step": 5568
},
{
"epoch": 0.56,
"grad_norm": 1.1633126735687256,
"learning_rate": 8.188330110816976e-05,
"loss": 0.9892,
"step": 5600
},
{
"epoch": 0.5632,
"grad_norm": 1.7112101316452026,
"learning_rate": 8.168929837864395e-05,
"loss": 0.9913,
"step": 5632
},
{
"epoch": 0.5664,
"grad_norm": 1.0041791200637817,
"learning_rate": 8.149449498305674e-05,
"loss": 1.0494,
"step": 5664
},
{
"epoch": 0.5696,
"grad_norm": 1.1538423299789429,
"learning_rate": 8.12988958433369e-05,
"loss": 1.0383,
"step": 5696
},
{
"epoch": 0.5728,
"grad_norm": 0.9828271865844727,
"learning_rate": 8.110250590151848e-05,
"loss": 1.1132,
"step": 5728
},
{
"epoch": 0.576,
"grad_norm": 1.243087649345398,
"learning_rate": 8.090533011961609e-05,
"loss": 1.008,
"step": 5760
},
{
"epoch": 0.5792,
"grad_norm": 1.0514239072799683,
"learning_rate": 8.070737347949947e-05,
"loss": 1.0286,
"step": 5792
},
{
"epoch": 0.5824,
"grad_norm": 1.0970929861068726,
"learning_rate": 8.050864098276762e-05,
"loss": 1.1212,
"step": 5824
},
{
"epoch": 0.5856,
"grad_norm": 1.0040539503097534,
"learning_rate": 8.030913765062245e-05,
"loss": 1.0395,
"step": 5856
},
{
"epoch": 0.5888,
"grad_norm": 0.8210061192512512,
"learning_rate": 8.010886852374191e-05,
"loss": 1.1159,
"step": 5888
},
{
"epoch": 0.592,
"grad_norm": 1.0136836767196655,
"learning_rate": 7.990783866215259e-05,
"loss": 1.0392,
"step": 5920
},
{
"epoch": 0.5952,
"grad_norm": 1.1107640266418457,
"learning_rate": 7.970605314510194e-05,
"loss": 1.0279,
"step": 5952
},
{
"epoch": 0.5984,
"grad_norm": 0.9535327553749084,
"learning_rate": 7.950351707092987e-05,
"loss": 1.0608,
"step": 5984
},
{
"epoch": 0.6016,
"grad_norm": 1.3050202131271362,
"learning_rate": 7.930023555693999e-05,
"loss": 1.0714,
"step": 6016
},
{
"epoch": 0.6048,
"grad_norm": 1.0925366878509521,
"learning_rate": 7.909621373927029e-05,
"loss": 0.9707,
"step": 6048
},
{
"epoch": 0.608,
"grad_norm": 0.9475853443145752,
"learning_rate": 7.88914567727634e-05,
"loss": 1.0056,
"step": 6080
},
{
"epoch": 0.6112,
"grad_norm": 1.2536673545837402,
"learning_rate": 7.868596983083623e-05,
"loss": 1.0983,
"step": 6112
},
{
"epoch": 0.6144,
"grad_norm": 1.1593080759048462,
"learning_rate": 7.847975810534943e-05,
"loss": 1.0214,
"step": 6144
},
{
"epoch": 0.6176,
"grad_norm": 1.4903924465179443,
"learning_rate": 7.82728268064761e-05,
"loss": 1.0825,
"step": 6176
},
{
"epoch": 0.6208,
"grad_norm": 1.31364905834198,
"learning_rate": 7.80651811625702e-05,
"loss": 1.0184,
"step": 6208
},
{
"epoch": 0.624,
"grad_norm": 1.1020359992980957,
"learning_rate": 7.785682642003437e-05,
"loss": 0.9785,
"step": 6240
},
{
"epoch": 0.6272,
"grad_norm": 1.680654525756836,
"learning_rate": 7.764776784318751e-05,
"loss": 1.0493,
"step": 6272
},
{
"epoch": 0.6304,
"grad_norm": 0.8548070192337036,
"learning_rate": 7.743801071413161e-05,
"loss": 1.0325,
"step": 6304
},
{
"epoch": 0.6336,
"grad_norm": 1.3193022012710571,
"learning_rate": 7.722756033261844e-05,
"loss": 1.0861,
"step": 6336
},
{
"epoch": 0.6368,
"grad_norm": 1.1262884140014648,
"learning_rate": 7.701642201591555e-05,
"loss": 0.9799,
"step": 6368
},
{
"epoch": 0.64,
"grad_norm": 1.0190273523330688,
"learning_rate": 7.680460109867194e-05,
"loss": 0.9806,
"step": 6400
},
{
"epoch": 0.6432,
"grad_norm": 0.9623986482620239,
"learning_rate": 7.659210293278334e-05,
"loss": 1.0146,
"step": 6432
},
{
"epoch": 0.6464,
"grad_norm": 0.8106020092964172,
"learning_rate": 7.637893288725688e-05,
"loss": 1.1549,
"step": 6464
},
{
"epoch": 0.6496,
"grad_norm": 1.0692909955978394,
"learning_rate": 7.616509634807549e-05,
"loss": 1.0515,
"step": 6496
},
{
"epoch": 0.6528,
"grad_norm": 0.7676146626472473,
"learning_rate": 7.595059871806187e-05,
"loss": 1.0496,
"step": 6528
},
{
"epoch": 0.656,
"grad_norm": 1.4028490781784058,
"learning_rate": 7.574217882816324e-05,
"loss": 1.1564,
"step": 6560
},
{
"epoch": 0.6592,
"grad_norm": 2.0384438037872314,
"learning_rate": 7.552639552903132e-05,
"loss": 0.9668,
"step": 6592
},
{
"epoch": 0.6624,
"grad_norm": 1.113044261932373,
"learning_rate": 7.53099672765677e-05,
"loss": 1.0345,
"step": 6624
},
{
"epoch": 0.6656,
"grad_norm": 1.3547977209091187,
"learning_rate": 7.509289953907758e-05,
"loss": 1.0719,
"step": 6656
},
{
"epoch": 0.6688,
"grad_norm": 0.9287874102592468,
"learning_rate": 7.487519780102354e-05,
"loss": 1.0301,
"step": 6688
},
{
"epoch": 0.672,
"grad_norm": 1.3750686645507812,
"learning_rate": 7.46568675628869e-05,
"loss": 1.0542,
"step": 6720
},
{
"epoch": 0.6752,
"grad_norm": 0.5963271260261536,
"learning_rate": 7.443791434102868e-05,
"loss": 0.9945,
"step": 6752
},
{
"epoch": 0.6784,
"grad_norm": 1.117193341255188,
"learning_rate": 7.421834366755039e-05,
"loss": 1.0214,
"step": 6784
},
{
"epoch": 0.6816,
"grad_norm": 1.096929907798767,
"learning_rate": 7.399816109015407e-05,
"loss": 1.0439,
"step": 6816
},
{
"epoch": 0.6848,
"grad_norm": 1.0610090494155884,
"learning_rate": 7.377737217200226e-05,
"loss": 1.041,
"step": 6848
},
{
"epoch": 0.688,
"grad_norm": 0.9771848320960999,
"learning_rate": 7.355598249157734e-05,
"loss": 1.1224,
"step": 6880
},
{
"epoch": 0.6912,
"grad_norm": 1.0380698442459106,
"learning_rate": 7.333399764254068e-05,
"loss": 1.0475,
"step": 6912
},
{
"epoch": 0.6944,
"grad_norm": 1.1135938167572021,
"learning_rate": 7.311142323359121e-05,
"loss": 0.9665,
"step": 6944
},
{
"epoch": 0.6976,
"grad_norm": 0.9427506327629089,
"learning_rate": 7.288826488832384e-05,
"loss": 1.0845,
"step": 6976
},
{
"epoch": 0.7008,
"grad_norm": 1.020609736442566,
"learning_rate": 7.266452824508719e-05,
"loss": 1.0806,
"step": 7008
},
{
"epoch": 0.704,
"grad_norm": 1.3327020406723022,
"learning_rate": 7.244021895684131e-05,
"loss": 1.0456,
"step": 7040
},
{
"epoch": 0.7072,
"grad_norm": 0.9490824937820435,
"learning_rate": 7.221534269101474e-05,
"loss": 1.0546,
"step": 7072
},
{
"epoch": 0.7104,
"grad_norm": 1.043341875076294,
"learning_rate": 7.198990512936135e-05,
"loss": 0.9643,
"step": 7104
},
{
"epoch": 0.7136,
"grad_norm": 1.0628856420516968,
"learning_rate": 7.17639119678168e-05,
"loss": 1.0433,
"step": 7136
},
{
"epoch": 0.7168,
"grad_norm": 0.8244098424911499,
"learning_rate": 7.153736891635463e-05,
"loss": 1.0359,
"step": 7168
},
{
"epoch": 0.72,
"grad_norm": 1.1554003953933716,
"learning_rate": 7.131028169884194e-05,
"loss": 1.0216,
"step": 7200
},
{
"epoch": 0.7232,
"grad_norm": 1.1582995653152466,
"learning_rate": 7.108265605289481e-05,
"loss": 0.9845,
"step": 7232
},
{
"epoch": 0.7264,
"grad_norm": 1.1655360460281372,
"learning_rate": 7.085449772973333e-05,
"loss": 1.0771,
"step": 7264
},
{
"epoch": 0.7296,
"grad_norm": 1.28196382522583,
"learning_rate": 7.062581249403627e-05,
"loss": 1.0186,
"step": 7296
},
{
"epoch": 0.7328,
"grad_norm": 1.149167537689209,
"learning_rate": 7.039660612379546e-05,
"loss": 0.9905,
"step": 7328
},
{
"epoch": 0.736,
"grad_norm": 1.0396078824996948,
"learning_rate": 7.016688441016979e-05,
"loss": 1.0196,
"step": 7360
},
{
"epoch": 0.7392,
"grad_norm": 0.8532673716545105,
"learning_rate": 6.993665315733889e-05,
"loss": 1.0197,
"step": 7392
},
{
"epoch": 0.7424,
"grad_norm": 1.03330659866333,
"learning_rate": 6.970591818235641e-05,
"loss": 1.0163,
"step": 7424
},
{
"epoch": 0.7456,
"grad_norm": 1.5266470909118652,
"learning_rate": 6.947468531500321e-05,
"loss": 1.0247,
"step": 7456
},
{
"epoch": 0.7488,
"grad_norm": 1.127951979637146,
"learning_rate": 6.924296039763987e-05,
"loss": 0.9851,
"step": 7488
},
{
"epoch": 0.752,
"grad_norm": 1.0132697820663452,
"learning_rate": 6.901074928505928e-05,
"loss": 1.0015,
"step": 7520
},
{
"epoch": 0.7552,
"grad_norm": 1.034342646598816,
"learning_rate": 6.877805784433852e-05,
"loss": 0.978,
"step": 7552
},
{
"epoch": 0.7584,
"grad_norm": 0.9696159958839417,
"learning_rate": 6.854489195469069e-05,
"loss": 1.129,
"step": 7584
},
{
"epoch": 0.7616,
"grad_norm": 1.056174874305725,
"learning_rate": 6.831125750731646e-05,
"loss": 1.0418,
"step": 7616
},
{
"epoch": 0.7648,
"grad_norm": 0.9070044755935669,
"learning_rate": 6.80771604052551e-05,
"loss": 1.0073,
"step": 7648
},
{
"epoch": 0.768,
"grad_norm": 1.1860136985778809,
"learning_rate": 6.784260656323533e-05,
"loss": 1.0599,
"step": 7680
},
{
"epoch": 0.7712,
"grad_norm": 1.0756847858428955,
"learning_rate": 6.760760190752604e-05,
"loss": 1.0392,
"step": 7712
},
{
"epoch": 0.7744,
"grad_norm": 1.247762680053711,
"learning_rate": 6.737215237578631e-05,
"loss": 1.0265,
"step": 7744
},
{
"epoch": 0.7776,
"grad_norm": 1.0265753269195557,
"learning_rate": 6.71362639169156e-05,
"loss": 1.031,
"step": 7776
},
{
"epoch": 0.7808,
"grad_norm": 1.0263795852661133,
"learning_rate": 6.689994249090333e-05,
"loss": 0.9527,
"step": 7808
},
{
"epoch": 0.784,
"grad_norm": 1.2893991470336914,
"learning_rate": 6.666319406867833e-05,
"loss": 1.1626,
"step": 7840
},
{
"epoch": 0.7872,
"grad_norm": 0.958138644695282,
"learning_rate": 6.642602463195799e-05,
"loss": 1.1133,
"step": 7872
},
{
"epoch": 0.7904,
"grad_norm": 1.257802128791809,
"learning_rate": 6.618844017309708e-05,
"loss": 1.0102,
"step": 7904
},
{
"epoch": 0.7936,
"grad_norm": 1.1419870853424072,
"learning_rate": 6.59504466949364e-05,
"loss": 1.0997,
"step": 7936
},
{
"epoch": 0.7968,
"grad_norm": 0.9523638486862183,
"learning_rate": 6.571205021065108e-05,
"loss": 1.0273,
"step": 7968
},
{
"epoch": 0.8,
"grad_norm": 1.1219632625579834,
"learning_rate": 6.547325674359865e-05,
"loss": 1.123,
"step": 8000
},
{
"epoch": 0.8032,
"grad_norm": 1.3210878372192383,
"learning_rate": 6.523407232716684e-05,
"loss": 0.9976,
"step": 8032
},
{
"epoch": 0.8064,
"grad_norm": 1.176743984222412,
"learning_rate": 6.499450300462121e-05,
"loss": 1.0448,
"step": 8064
},
{
"epoch": 0.8096,
"grad_norm": 1.2411494255065918,
"learning_rate": 6.475455482895238e-05,
"loss": 1.0001,
"step": 8096
},
{
"epoch": 0.8128,
"grad_norm": 1.0539944171905518,
"learning_rate": 6.451423386272312e-05,
"loss": 1.122,
"step": 8128
},
{
"epoch": 0.816,
"grad_norm": 2.260613203048706,
"learning_rate": 6.427354617791519e-05,
"loss": 1.005,
"step": 8160
},
{
"epoch": 0.8192,
"grad_norm": 1.2137510776519775,
"learning_rate": 6.403249785577589e-05,
"loss": 0.9567,
"step": 8192
},
{
"epoch": 0.8224,
"grad_norm": 1.1636831760406494,
"learning_rate": 6.379109498666445e-05,
"loss": 1.0428,
"step": 8224
},
{
"epoch": 0.8256,
"grad_norm": 1.1331391334533691,
"learning_rate": 6.354934366989812e-05,
"loss": 1.0609,
"step": 8256
},
{
"epoch": 0.8288,
"grad_norm": 1.5374737977981567,
"learning_rate": 6.330725001359809e-05,
"loss": 1.0728,
"step": 8288
},
{
"epoch": 0.832,
"grad_norm": 1.287787675857544,
"learning_rate": 6.306482013453515e-05,
"loss": 1.0416,
"step": 8320
},
{
"epoch": 0.8352,
"grad_norm": 1.130149006843567,
"learning_rate": 6.28220601579751e-05,
"loss": 1.0513,
"step": 8352
},
{
"epoch": 0.8384,
"grad_norm": 0.9294027090072632,
"learning_rate": 6.257897621752405e-05,
"loss": 1.0551,
"step": 8384
},
{
"epoch": 0.8416,
"grad_norm": 0.8640485405921936,
"learning_rate": 6.233557445497345e-05,
"loss": 1.0518,
"step": 8416
},
{
"epoch": 0.8448,
"grad_norm": 1.1084208488464355,
"learning_rate": 6.209186102014486e-05,
"loss": 1.0359,
"step": 8448
},
{
"epoch": 0.848,
"grad_norm": 0.8203976154327393,
"learning_rate": 6.18478420707346e-05,
"loss": 0.9709,
"step": 8480
},
{
"epoch": 0.8512,
"grad_norm": 1.507534384727478,
"learning_rate": 6.160352377215816e-05,
"loss": 0.9479,
"step": 8512
},
{
"epoch": 0.8544,
"grad_norm": 0.8405012488365173,
"learning_rate": 6.135891229739444e-05,
"loss": 1.025,
"step": 8544
},
{
"epoch": 0.8576,
"grad_norm": 0.9983368515968323,
"learning_rate": 6.111401382682972e-05,
"loss": 1.1023,
"step": 8576
},
{
"epoch": 0.8608,
"grad_norm": 1.079447865486145,
"learning_rate": 6.086883454810162e-05,
"loss": 0.9684,
"step": 8608
},
{
"epoch": 0.864,
"grad_norm": 0.963784396648407,
"learning_rate": 6.06310551852323e-05,
"loss": 1.0703,
"step": 8640
},
{
"epoch": 0.8672,
"grad_norm": 1.44817316532135,
"learning_rate": 6.0385341175240205e-05,
"loss": 1.0276,
"step": 8672
},
{
"epoch": 0.8704,
"grad_norm": 1.0072044134140015,
"learning_rate": 6.0139364767825626e-05,
"loss": 1.0744,
"step": 8704
},
{
"epoch": 0.8736,
"grad_norm": 1.328588604927063,
"learning_rate": 5.9893132177861454e-05,
"loss": 1.0823,
"step": 8736
},
{
"epoch": 0.8768,
"grad_norm": 1.323585867881775,
"learning_rate": 5.964664962669333e-05,
"loss": 1.0011,
"step": 8768
},
{
"epoch": 0.88,
"grad_norm": 1.4633543491363525,
"learning_rate": 5.939992334198242e-05,
"loss": 0.9919,
"step": 8800
},
{
"epoch": 0.8832,
"grad_norm": 1.0282506942749023,
"learning_rate": 5.9152959557548117e-05,
"loss": 1.0215,
"step": 8832
},
{
"epoch": 0.8864,
"grad_norm": 0.8649700284004211,
"learning_rate": 5.89057645132105e-05,
"loss": 1.0628,
"step": 8864
},
{
"epoch": 0.8896,
"grad_norm": 0.9102625846862793,
"learning_rate": 5.865834445463273e-05,
"loss": 0.9597,
"step": 8896
},
{
"epoch": 0.8928,
"grad_norm": 1.0294193029403687,
"learning_rate": 5.841070563316315e-05,
"loss": 1.0335,
"step": 8928
},
{
"epoch": 0.896,
"grad_norm": 1.122887372970581,
"learning_rate": 5.8162854305677425e-05,
"loss": 1.0743,
"step": 8960
},
{
"epoch": 0.8992,
"grad_norm": 1.419608235359192,
"learning_rate": 5.791479673442044e-05,
"loss": 1.0136,
"step": 8992
},
{
"epoch": 0.9024,
"grad_norm": 1.0360965728759766,
"learning_rate": 5.7666539186848036e-05,
"loss": 1.0314,
"step": 9024
},
{
"epoch": 0.9056,
"grad_norm": 1.2409007549285889,
"learning_rate": 5.74180879354687e-05,
"loss": 0.903,
"step": 9056
},
{
"epoch": 0.9088,
"grad_norm": 1.0799171924591064,
"learning_rate": 5.716944925768505e-05,
"loss": 1.0727,
"step": 9088
},
{
"epoch": 0.912,
"grad_norm": 1.0068849325180054,
"learning_rate": 5.6920629435635256e-05,
"loss": 0.9064,
"step": 9120
},
{
"epoch": 0.9152,
"grad_norm": 0.9761477708816528,
"learning_rate": 5.6671634756034295e-05,
"loss": 0.9928,
"step": 9152
},
{
"epoch": 0.9184,
"grad_norm": 1.3264461755752563,
"learning_rate": 5.642247151001515e-05,
"loss": 1.0678,
"step": 9184
},
{
"epoch": 0.9216,
"grad_norm": 1.6155526638031006,
"learning_rate": 5.617314599296977e-05,
"loss": 1.0057,
"step": 9216
},
{
"epoch": 0.9248,
"grad_norm": 0.985884428024292,
"learning_rate": 5.592366450439012e-05,
"loss": 1.0783,
"step": 9248
},
{
"epoch": 0.928,
"grad_norm": 1.1194316148757935,
"learning_rate": 5.567403334770891e-05,
"loss": 1.086,
"step": 9280
},
{
"epoch": 0.9312,
"grad_norm": 0.9581426978111267,
"learning_rate": 5.542425883014043e-05,
"loss": 1.0819,
"step": 9312
},
{
"epoch": 0.9344,
"grad_norm": 1.0260018110275269,
"learning_rate": 5.517434726252113e-05,
"loss": 1.0206,
"step": 9344
},
{
"epoch": 0.9376,
"grad_norm": 1.3467062711715698,
"learning_rate": 5.4924304959150175e-05,
"loss": 1.0682,
"step": 9376
},
{
"epoch": 0.9408,
"grad_norm": 1.030444622039795,
"learning_rate": 5.467413823762993e-05,
"loss": 1.0894,
"step": 9408
},
{
"epoch": 0.944,
"grad_norm": 1.1066439151763916,
"learning_rate": 5.4423853418706327e-05,
"loss": 0.938,
"step": 9440
},
{
"epoch": 0.9472,
"grad_norm": 1.088860034942627,
"learning_rate": 5.417345682610914e-05,
"loss": 1.0293,
"step": 9472
},
{
"epoch": 0.9504,
"grad_norm": 1.4524608850479126,
"learning_rate": 5.392295478639225e-05,
"loss": 1.0259,
"step": 9504
},
{
"epoch": 0.9536,
"grad_norm": 1.0502616167068481,
"learning_rate": 5.367235362877378e-05,
"loss": 0.9685,
"step": 9536
},
{
"epoch": 0.9568,
"grad_norm": 1.1287665367126465,
"learning_rate": 5.3421659684976197e-05,
"loss": 1.0295,
"step": 9568
},
{
"epoch": 0.96,
"grad_norm": 1.4596409797668457,
"learning_rate": 5.317087928906627e-05,
"loss": 1.0235,
"step": 9600
},
{
"epoch": 0.9632,
"grad_norm": 1.3627421855926514,
"learning_rate": 5.29200187772951e-05,
"loss": 1.126,
"step": 9632
},
{
"epoch": 0.9664,
"grad_norm": 1.2144567966461182,
"learning_rate": 5.266908448793803e-05,
"loss": 0.9882,
"step": 9664
},
{
"epoch": 0.9696,
"grad_norm": 1.453833818435669,
"learning_rate": 5.2418082761134445e-05,
"loss": 1.0644,
"step": 9696
},
{
"epoch": 0.9728,
"grad_norm": 1.1099966764450073,
"learning_rate": 5.216701993872762e-05,
"loss": 0.974,
"step": 9728
},
{
"epoch": 0.976,
"grad_norm": 0.8567425012588501,
"learning_rate": 5.1915902364104506e-05,
"loss": 1.0689,
"step": 9760
},
{
"epoch": 0.9792,
"grad_norm": 1.1577990055084229,
"learning_rate": 5.166473638203539e-05,
"loss": 1.0094,
"step": 9792
},
{
"epoch": 0.9824,
"grad_norm": 0.8881478905677795,
"learning_rate": 5.141352833851367e-05,
"loss": 1.0945,
"step": 9824
},
{
"epoch": 0.9856,
"grad_norm": 0.8964444994926453,
"learning_rate": 5.116228458059543e-05,
"loss": 1.0251,
"step": 9856
},
{
"epoch": 0.9888,
"grad_norm": 1.2837964296340942,
"learning_rate": 5.0911011456239157e-05,
"loss": 1.1041,
"step": 9888
},
{
"epoch": 0.992,
"grad_norm": 1.0828759670257568,
"learning_rate": 5.065971531414528e-05,
"loss": 1.0765,
"step": 9920
},
{
"epoch": 0.9952,
"grad_norm": 1.0157177448272705,
"learning_rate": 5.0408402503595845e-05,
"loss": 1.0109,
"step": 9952
},
{
"epoch": 0.9984,
"grad_norm": 1.128143310546875,
"learning_rate": 5.0157079374293983e-05,
"loss": 1.0521,
"step": 9984
},
{
"epoch": 1.0016,
"grad_norm": 1.0766175985336304,
"learning_rate": 4.990575227620359e-05,
"loss": 1.0581,
"step": 10016
},
{
"epoch": 1.0048,
"grad_norm": 1.3999875783920288,
"learning_rate": 4.965442755938884e-05,
"loss": 0.935,
"step": 10048
},
{
"epoch": 1.008,
"grad_norm": 1.262337565422058,
"learning_rate": 4.9403111573853686e-05,
"loss": 0.9973,
"step": 10080
},
{
"epoch": 1.0112,
"grad_norm": 1.070391297340393,
"learning_rate": 4.9151810669381556e-05,
"loss": 1.0556,
"step": 10112
},
{
"epoch": 1.0144,
"grad_norm": 1.2712632417678833,
"learning_rate": 4.890053119537475e-05,
"loss": 0.9714,
"step": 10144
},
{
"epoch": 1.0176,
"grad_norm": 1.2587823867797852,
"learning_rate": 4.864927950069416e-05,
"loss": 1.0238,
"step": 10176
},
{
"epoch": 1.0208,
"grad_norm": 1.1266289949417114,
"learning_rate": 4.8398061933498816e-05,
"loss": 1.0768,
"step": 10208
},
{
"epoch": 1.024,
"grad_norm": 1.0433228015899658,
"learning_rate": 4.81468848410854e-05,
"loss": 1.0194,
"step": 10240
},
{
"epoch": 1.0272,
"grad_norm": 0.9119483828544617,
"learning_rate": 4.7895754569728066e-05,
"loss": 0.9746,
"step": 10272
},
{
"epoch": 1.0304,
"grad_norm": 0.9693041443824768,
"learning_rate": 4.7644677464517874e-05,
"loss": 1.0196,
"step": 10304
},
{
"epoch": 1.0336,
"grad_norm": 1.5135239362716675,
"learning_rate": 4.739365986920265e-05,
"loss": 0.9915,
"step": 10336
},
{
"epoch": 1.0368,
"grad_norm": 1.232332468032837,
"learning_rate": 4.714270812602657e-05,
"loss": 1.0194,
"step": 10368
},
{
"epoch": 1.04,
"grad_norm": 1.0907468795776367,
"learning_rate": 4.6891828575570055e-05,
"loss": 1.0179,
"step": 10400
},
{
"epoch": 1.0432,
"grad_norm": 1.0710036754608154,
"learning_rate": 4.664102755658948e-05,
"loss": 0.9436,
"step": 10432
},
{
"epoch": 1.0464,
"grad_norm": 1.119939923286438,
"learning_rate": 4.639031140585697e-05,
"loss": 1.1025,
"step": 10464
},
{
"epoch": 1.0496,
"grad_norm": 1.2719630002975464,
"learning_rate": 4.613968645800044e-05,
"loss": 1.066,
"step": 10496
},
{
"epoch": 1.0528,
"grad_norm": 1.1809210777282715,
"learning_rate": 4.5889159045343404e-05,
"loss": 1.0601,
"step": 10528
},
{
"epoch": 1.056,
"grad_norm": 1.0106052160263062,
"learning_rate": 4.563873549774506e-05,
"loss": 0.9535,
"step": 10560
},
{
"epoch": 1.0592,
"grad_norm": 1.2337009906768799,
"learning_rate": 4.538842214244035e-05,
"loss": 0.9777,
"step": 10592
},
{
"epoch": 1.0624,
"grad_norm": 1.092423915863037,
"learning_rate": 4.513822530388003e-05,
"loss": 1.0026,
"step": 10624
},
{
"epoch": 1.0656,
"grad_norm": 1.0055973529815674,
"learning_rate": 4.4888151303571026e-05,
"loss": 1.02,
"step": 10656
},
{
"epoch": 1.0688,
"grad_norm": 1.6361074447631836,
"learning_rate": 4.463820645991651e-05,
"loss": 1.0177,
"step": 10688
},
{
"epoch": 1.072,
"grad_norm": 1.4629695415496826,
"learning_rate": 4.43883970880564e-05,
"loss": 1.0176,
"step": 10720
},
{
"epoch": 1.0752,
"grad_norm": 1.0951917171478271,
"learning_rate": 4.4138729499707844e-05,
"loss": 0.9829,
"step": 10752
},
{
"epoch": 1.0784,
"grad_norm": 1.361081600189209,
"learning_rate": 4.3889210003005524e-05,
"loss": 1.0409,
"step": 10784
},
{
"epoch": 1.0816,
"grad_norm": 1.0057966709136963,
"learning_rate": 4.363984490234256e-05,
"loss": 1.0299,
"step": 10816
},
{
"epoch": 1.0848,
"grad_norm": 1.2500144243240356,
"learning_rate": 4.339064049821097e-05,
"loss": 0.9951,
"step": 10848
},
{
"epoch": 1.088,
"grad_norm": 1.200129508972168,
"learning_rate": 4.314160308704268e-05,
"loss": 1.0495,
"step": 10880
},
{
"epoch": 1.0912,
"grad_norm": 1.3747400045394897,
"learning_rate": 4.289273896105027e-05,
"loss": 1.0671,
"step": 10912
},
{
"epoch": 1.0944,
"grad_norm": 1.1098862886428833,
"learning_rate": 4.264405440806813e-05,
"loss": 0.9685,
"step": 10944
},
{
"epoch": 1.0976,
"grad_norm": 1.1737552881240845,
"learning_rate": 4.239555571139353e-05,
"loss": 0.9821,
"step": 10976
},
{
"epoch": 1.1008,
"grad_norm": 1.214131474494934,
"learning_rate": 4.2147249149627824e-05,
"loss": 0.9924,
"step": 11008
},
{
"epoch": 1.104,
"grad_norm": 1.2195714712142944,
"learning_rate": 4.1899140996517934e-05,
"loss": 0.9751,
"step": 11040
},
{
"epoch": 1.1072,
"grad_norm": 2.5252718925476074,
"learning_rate": 4.165123752079768e-05,
"loss": 0.9862,
"step": 11072
},
{
"epoch": 1.1104,
"grad_norm": 1.2083877325057983,
"learning_rate": 4.140354498602952e-05,
"loss": 0.9756,
"step": 11104
},
{
"epoch": 1.1136,
"grad_norm": 1.0610405206680298,
"learning_rate": 4.115606965044628e-05,
"loss": 0.949,
"step": 11136
},
{
"epoch": 1.1168,
"grad_norm": 1.208709716796875,
"learning_rate": 4.090881776679293e-05,
"loss": 1.0923,
"step": 11168
},
{
"epoch": 1.12,
"grad_norm": 1.0672675371170044,
"learning_rate": 4.0669511486535804e-05,
"loss": 1.1012,
"step": 11200
},
{
"epoch": 1.1232,
"grad_norm": 1.0231329202651978,
"learning_rate": 4.04227177746873e-05,
"loss": 0.9895,
"step": 11232
},
{
"epoch": 1.1264,
"grad_norm": 1.1311768293380737,
"learning_rate": 4.0176166043735534e-05,
"loss": 0.979,
"step": 11264
},
{
"epoch": 1.1296,
"grad_norm": 1.1029884815216064,
"learning_rate": 3.992986252308955e-05,
"loss": 1.0535,
"step": 11296
},
{
"epoch": 1.1328,
"grad_norm": 1.1183357238769531,
"learning_rate": 3.9683813435887156e-05,
"loss": 1.0938,
"step": 11328
},
{
"epoch": 1.1360000000000001,
"grad_norm": 1.1461360454559326,
"learning_rate": 3.943802499883758e-05,
"loss": 1.0087,
"step": 11360
},
{
"epoch": 1.1392,
"grad_norm": 1.1354318857192993,
"learning_rate": 3.9192503422064384e-05,
"loss": 1.0062,
"step": 11392
},
{
"epoch": 1.1424,
"grad_norm": 1.3638827800750732,
"learning_rate": 3.89472549089487e-05,
"loss": 0.9556,
"step": 11424
},
{
"epoch": 1.1456,
"grad_norm": 1.1214622259140015,
"learning_rate": 3.870228565597229e-05,
"loss": 1.0778,
"step": 11456
},
{
"epoch": 1.1488,
"grad_norm": 1.1730358600616455,
"learning_rate": 3.8457601852561164e-05,
"loss": 0.9723,
"step": 11488
},
{
"epoch": 1.152,
"grad_norm": 1.0663352012634277,
"learning_rate": 3.821320968092912e-05,
"loss": 1.0043,
"step": 11520
},
{
"epoch": 1.1552,
"grad_norm": 2.856649875640869,
"learning_rate": 3.79691153159215e-05,
"loss": 1.0079,
"step": 11552
},
{
"epoch": 1.1584,
"grad_norm": 0.969799280166626,
"learning_rate": 3.7725324924859285e-05,
"loss": 0.9749,
"step": 11584
},
{
"epoch": 1.1616,
"grad_norm": 1.0517395734786987,
"learning_rate": 3.7481844667383146e-05,
"loss": 1.0013,
"step": 11616
},
{
"epoch": 1.1648,
"grad_norm": 1.0082898139953613,
"learning_rate": 3.7238680695297944e-05,
"loss": 1.0385,
"step": 11648
},
{
"epoch": 1.168,
"grad_norm": 0.9222135543823242,
"learning_rate": 3.699583915241717e-05,
"loss": 1.041,
"step": 11680
},
{
"epoch": 1.1712,
"grad_norm": 1.1111342906951904,
"learning_rate": 3.6753326174407835e-05,
"loss": 1.0354,
"step": 11712
},
{
"epoch": 1.1743999999999999,
"grad_norm": 0.977922797203064,
"learning_rate": 3.651114788863534e-05,
"loss": 0.8985,
"step": 11744
},
{
"epoch": 1.1776,
"grad_norm": 1.017745852470398,
"learning_rate": 3.626931041400871e-05,
"loss": 1.0436,
"step": 11776
},
{
"epoch": 1.1808,
"grad_norm": 1.5313466787338257,
"learning_rate": 3.602781986082603e-05,
"loss": 1.0054,
"step": 11808
},
{
"epoch": 1.184,
"grad_norm": 1.1083983182907104,
"learning_rate": 3.578668233061995e-05,
"loss": 0.937,
"step": 11840
},
{
"epoch": 1.1872,
"grad_norm": 1.449925184249878,
"learning_rate": 3.554590391600368e-05,
"loss": 1.0421,
"step": 11872
},
{
"epoch": 1.1904,
"grad_norm": 1.0687569379806519,
"learning_rate": 3.530549070051691e-05,
"loss": 1.1248,
"step": 11904
},
{
"epoch": 1.1936,
"grad_norm": 1.0051459074020386,
"learning_rate": 3.506544875847215e-05,
"loss": 1.0627,
"step": 11936
},
{
"epoch": 1.1968,
"grad_norm": 0.9372243285179138,
"learning_rate": 3.482578415480133e-05,
"loss": 0.9443,
"step": 11968
},
{
"epoch": 1.2,
"grad_norm": 1.2080271244049072,
"learning_rate": 3.458650294490243e-05,
"loss": 1.0279,
"step": 12000
},
{
"epoch": 1.2032,
"grad_norm": 0.9924854636192322,
"learning_rate": 3.4347611174486585e-05,
"loss": 0.9565,
"step": 12032
},
{
"epoch": 1.2064,
"grad_norm": 1.0313811302185059,
"learning_rate": 3.410911487942531e-05,
"loss": 0.9888,
"step": 12064
},
{
"epoch": 1.2096,
"grad_norm": 0.8139066100120544,
"learning_rate": 3.387102008559795e-05,
"loss": 0.9131,
"step": 12096
},
{
"epoch": 1.2128,
"grad_norm": 1.2079484462738037,
"learning_rate": 3.363333280873951e-05,
"loss": 0.969,
"step": 12128
},
{
"epoch": 1.216,
"grad_norm": 1.2186810970306396,
"learning_rate": 3.3396059054288556e-05,
"loss": 1.0107,
"step": 12160
},
{
"epoch": 1.2192,
"grad_norm": 1.5260050296783447,
"learning_rate": 3.3159204817235626e-05,
"loss": 1.0955,
"step": 12192
},
{
"epoch": 1.2224,
"grad_norm": 1.1795814037322998,
"learning_rate": 3.2922776081971577e-05,
"loss": 1.0834,
"step": 12224
},
{
"epoch": 1.2256,
"grad_norm": 1.3864374160766602,
"learning_rate": 3.268677882213657e-05,
"loss": 1.0377,
"step": 12256
},
{
"epoch": 1.2288000000000001,
"grad_norm": 1.0220489501953125,
"learning_rate": 3.2451219000469016e-05,
"loss": 1.0321,
"step": 12288
},
{
"epoch": 1.232,
"grad_norm": 0.8851804733276367,
"learning_rate": 3.22161025686549e-05,
"loss": 0.9517,
"step": 12320
},
{
"epoch": 1.2352,
"grad_norm": 0.9731053113937378,
"learning_rate": 3.198143546717758e-05,
"loss": 1.0006,
"step": 12352
},
{
"epoch": 1.2384,
"grad_norm": 1.3373504877090454,
"learning_rate": 3.1747223625167435e-05,
"loss": 1.0912,
"step": 12384
},
{
"epoch": 1.2416,
"grad_norm": 1.0071483850479126,
"learning_rate": 3.151347296025231e-05,
"loss": 0.9673,
"step": 12416
},
{
"epoch": 1.2448,
"grad_norm": 1.0482680797576904,
"learning_rate": 3.1280189378407845e-05,
"loss": 1.0046,
"step": 12448
},
{
"epoch": 1.248,
"grad_norm": 0.9654492735862732,
"learning_rate": 3.104737877380828e-05,
"loss": 0.9869,
"step": 12480
},
{
"epoch": 1.2511999999999999,
"grad_norm": 1.0645602941513062,
"learning_rate": 3.0815047028677565e-05,
"loss": 0.9156,
"step": 12512
},
{
"epoch": 1.2544,
"grad_norm": 1.0766998529434204,
"learning_rate": 3.058320001314071e-05,
"loss": 1.0591,
"step": 12544
},
{
"epoch": 1.2576,
"grad_norm": 1.0413891077041626,
"learning_rate": 3.035184358507549e-05,
"loss": 1.0322,
"step": 12576
},
{
"epoch": 1.2608,
"grad_norm": 0.9120105504989624,
"learning_rate": 3.012098358996448e-05,
"loss": 1.0726,
"step": 12608
},
{
"epoch": 1.264,
"grad_norm": 1.2529538869857788,
"learning_rate": 2.9890625860747224e-05,
"loss": 1.0475,
"step": 12640
},
{
"epoch": 1.2671999999999999,
"grad_norm": 1.0173494815826416,
"learning_rate": 2.9660776217673004e-05,
"loss": 0.9891,
"step": 12672
},
{
"epoch": 1.2704,
"grad_norm": 0.929347813129425,
"learning_rate": 2.9431440468153714e-05,
"loss": 1.053,
"step": 12704
},
{
"epoch": 1.2736,
"grad_norm": 1.123602032661438,
"learning_rate": 2.9202624406617163e-05,
"loss": 1.0327,
"step": 12736
},
{
"epoch": 1.2768,
"grad_norm": 1.1687569618225098,
"learning_rate": 2.8974333814360605e-05,
"loss": 0.9953,
"step": 12768
},
{
"epoch": 1.28,
"grad_norm": 1.5183366537094116,
"learning_rate": 2.8746574459404774e-05,
"loss": 1.1038,
"step": 12800
},
{
"epoch": 1.2832,
"grad_norm": 1.2359113693237305,
"learning_rate": 2.8519352096348086e-05,
"loss": 0.9681,
"step": 12832
},
{
"epoch": 1.2864,
"grad_norm": 1.1729321479797363,
"learning_rate": 2.8292672466221193e-05,
"loss": 1.0964,
"step": 12864
},
{
"epoch": 1.2896,
"grad_norm": 1.2479591369628906,
"learning_rate": 2.806654129634205e-05,
"loss": 0.9706,
"step": 12896
},
{
"epoch": 1.2928,
"grad_norm": 0.9051811099052429,
"learning_rate": 2.784096430017108e-05,
"loss": 1.0464,
"step": 12928
},
{
"epoch": 1.296,
"grad_norm": 1.0455889701843262,
"learning_rate": 2.7615947177166956e-05,
"loss": 1.0636,
"step": 12960
},
{
"epoch": 1.2992,
"grad_norm": 1.0110808610916138,
"learning_rate": 2.7391495612642447e-05,
"loss": 1.0624,
"step": 12992
},
{
"epoch": 1.3024,
"grad_norm": 1.0357776880264282,
"learning_rate": 2.7167615277620857e-05,
"loss": 1.0457,
"step": 13024
},
{
"epoch": 1.3056,
"grad_norm": 1.2811650037765503,
"learning_rate": 2.6944311828692782e-05,
"loss": 1.0167,
"step": 13056
},
{
"epoch": 1.3088,
"grad_norm": 1.0709502696990967,
"learning_rate": 2.672159090787307e-05,
"loss": 1.0547,
"step": 13088
},
{
"epoch": 1.312,
"grad_norm": 0.9260007739067078,
"learning_rate": 2.6499458142458376e-05,
"loss": 0.9556,
"step": 13120
},
{
"epoch": 1.3152,
"grad_norm": 1.2283986806869507,
"learning_rate": 2.6277919144884962e-05,
"loss": 1.0471,
"step": 13152
},
{
"epoch": 1.3184,
"grad_norm": 1.5820285081863403,
"learning_rate": 2.6056979512586786e-05,
"loss": 1.0746,
"step": 13184
},
{
"epoch": 1.3216,
"grad_norm": 1.4699950218200684,
"learning_rate": 2.5836644827854285e-05,
"loss": 1.0978,
"step": 13216
},
{
"epoch": 1.3248,
"grad_norm": 1.077929973602295,
"learning_rate": 2.5616920657693077e-05,
"loss": 1.0179,
"step": 13248
},
{
"epoch": 1.328,
"grad_norm": 1.126991629600525,
"learning_rate": 2.5397812553683552e-05,
"loss": 1.0385,
"step": 13280
},
{
"epoch": 1.3312,
"grad_norm": 1.0877243280410767,
"learning_rate": 2.5179326051840414e-05,
"loss": 1.0298,
"step": 13312
},
{
"epoch": 1.3344,
"grad_norm": 0.9541513323783875,
"learning_rate": 2.4961466672472933e-05,
"loss": 1.0621,
"step": 13344
},
{
"epoch": 1.3376000000000001,
"grad_norm": 1.2593402862548828,
"learning_rate": 2.4744239920045388e-05,
"loss": 1.0729,
"step": 13376
},
{
"epoch": 1.3408,
"grad_norm": 1.754684567451477,
"learning_rate": 2.4527651283038e-05,
"loss": 1.0269,
"step": 13408
},
{
"epoch": 1.3439999999999999,
"grad_norm": 1.3737800121307373,
"learning_rate": 2.4311706233808357e-05,
"loss": 0.9552,
"step": 13440
},
{
"epoch": 1.3472,
"grad_norm": 1.1643366813659668,
"learning_rate": 2.4096410228452974e-05,
"loss": 0.968,
"step": 13472
},
{
"epoch": 1.3504,
"grad_norm": 0.981141209602356,
"learning_rate": 2.388176870666962e-05,
"loss": 0.9737,
"step": 13504
},
{
"epoch": 1.3536000000000001,
"grad_norm": 0.9870623350143433,
"learning_rate": 2.3667787091619775e-05,
"loss": 1.0105,
"step": 13536
},
{
"epoch": 1.3568,
"grad_norm": 1.3019083738327026,
"learning_rate": 2.3454470789791577e-05,
"loss": 1.0373,
"step": 13568
},
{
"epoch": 1.3599999999999999,
"grad_norm": 1.1865559816360474,
"learning_rate": 2.3241825190863337e-05,
"loss": 1.0258,
"step": 13600
},
{
"epoch": 1.3632,
"grad_norm": 1.5122630596160889,
"learning_rate": 2.3029855667567237e-05,
"loss": 0.9408,
"step": 13632
},
{
"epoch": 1.3664,
"grad_norm": 1.4110867977142334,
"learning_rate": 2.2818567575553702e-05,
"loss": 1.0212,
"step": 13664
},
{
"epoch": 1.3696,
"grad_norm": 1.5669643878936768,
"learning_rate": 2.2607966253255958e-05,
"loss": 1.0016,
"step": 13696
},
{
"epoch": 1.3728,
"grad_norm": 0.9724251627922058,
"learning_rate": 2.2398057021755286e-05,
"loss": 1.0003,
"step": 13728
},
{
"epoch": 1.376,
"grad_norm": 1.1270085573196411,
"learning_rate": 2.218884518464645e-05,
"loss": 1.0016,
"step": 13760
},
{
"epoch": 1.3792,
"grad_norm": 1.1711382865905762,
"learning_rate": 2.1980336027903764e-05,
"loss": 0.9839,
"step": 13792
},
{
"epoch": 1.3824,
"grad_norm": 1.2853403091430664,
"learning_rate": 2.177253481974757e-05,
"loss": 0.9682,
"step": 13824
},
{
"epoch": 1.3856,
"grad_norm": 0.9878413081169128,
"learning_rate": 2.1565446810511015e-05,
"loss": 1.0713,
"step": 13856
},
{
"epoch": 1.3888,
"grad_norm": 0.9764726161956787,
"learning_rate": 2.135907723250752e-05,
"loss": 1.0536,
"step": 13888
},
{
"epoch": 1.392,
"grad_norm": 1.0668312311172485,
"learning_rate": 2.1153431299898535e-05,
"loss": 1.0154,
"step": 13920
},
{
"epoch": 1.3952,
"grad_norm": 0.9996973276138306,
"learning_rate": 2.0954906783923116e-05,
"loss": 1.0502,
"step": 13952
},
{
"epoch": 1.3984,
"grad_norm": 1.2554295063018799,
"learning_rate": 2.0750700695049847e-05,
"loss": 1.0054,
"step": 13984
},
{
"epoch": 1.4016,
"grad_norm": 1.2442419528961182,
"learning_rate": 2.0547233622894208e-05,
"loss": 1.0073,
"step": 14016
},
{
"epoch": 1.4048,
"grad_norm": 0.9537378549575806,
"learning_rate": 2.0344510708282556e-05,
"loss": 0.9257,
"step": 14048
},
{
"epoch": 1.408,
"grad_norm": 1.306130290031433,
"learning_rate": 2.0142537073239192e-05,
"loss": 0.973,
"step": 14080
},
{
"epoch": 1.4112,
"grad_norm": 1.0325103998184204,
"learning_rate": 1.9941317820857086e-05,
"loss": 0.963,
"step": 14112
},
{
"epoch": 1.4144,
"grad_norm": 1.262694001197815,
"learning_rate": 1.9740858035168857e-05,
"loss": 0.9933,
"step": 14144
},
{
"epoch": 1.4176,
"grad_norm": 1.0754705667495728,
"learning_rate": 1.9541162781018297e-05,
"loss": 0.9746,
"step": 14176
},
{
"epoch": 1.4208,
"grad_norm": 1.199257254600525,
"learning_rate": 1.934223710393249e-05,
"loss": 0.9835,
"step": 14208
},
{
"epoch": 1.424,
"grad_norm": 0.9726549983024597,
"learning_rate": 1.914408602999424e-05,
"loss": 0.9651,
"step": 14240
},
{
"epoch": 1.4272,
"grad_norm": 1.086094856262207,
"learning_rate": 1.8946714565715166e-05,
"loss": 1.0708,
"step": 14272
},
{
"epoch": 1.4304000000000001,
"grad_norm": 1.1396266222000122,
"learning_rate": 1.8750127697909154e-05,
"loss": 1.0532,
"step": 14304
},
{
"epoch": 1.4336,
"grad_norm": 0.9932637214660645,
"learning_rate": 1.8554330393566356e-05,
"loss": 1.0443,
"step": 14336
},
{
"epoch": 1.4368,
"grad_norm": 1.016258716583252,
"learning_rate": 1.8359327599727698e-05,
"loss": 1.0207,
"step": 14368
},
{
"epoch": 1.44,
"grad_norm": 1.2886885404586792,
"learning_rate": 1.816512424335991e-05,
"loss": 1.0197,
"step": 14400
},
{
"epoch": 1.4432,
"grad_norm": 0.940187931060791,
"learning_rate": 1.7971725231231044e-05,
"loss": 1.041,
"step": 14432
},
{
"epoch": 1.4464000000000001,
"grad_norm": 0.9931338429450989,
"learning_rate": 1.7779135449786482e-05,
"loss": 1.0637,
"step": 14464
},
{
"epoch": 1.4496,
"grad_norm": 1.3718934059143066,
"learning_rate": 1.7587359765025435e-05,
"loss": 1.0025,
"step": 14496
},
{
"epoch": 1.4527999999999999,
"grad_norm": 1.0311670303344727,
"learning_rate": 1.7396403022378095e-05,
"loss": 1.021,
"step": 14528
},
{
"epoch": 1.456,
"grad_norm": 1.155760407447815,
"learning_rate": 1.7206270046583085e-05,
"loss": 1.0201,
"step": 14560
},
{
"epoch": 1.4592,
"grad_norm": 1.2017810344696045,
"learning_rate": 1.7016965641565703e-05,
"loss": 1.0173,
"step": 14592
},
{
"epoch": 1.4624,
"grad_norm": 1.0423095226287842,
"learning_rate": 1.682849459031639e-05,
"loss": 1.0107,
"step": 14624
},
{
"epoch": 1.4656,
"grad_norm": 1.2119863033294678,
"learning_rate": 1.6640861654770012e-05,
"loss": 0.9798,
"step": 14656
},
{
"epoch": 1.4687999999999999,
"grad_norm": 1.078291893005371,
"learning_rate": 1.6454071575685488e-05,
"loss": 0.9613,
"step": 14688
},
{
"epoch": 1.472,
"grad_norm": 1.1082805395126343,
"learning_rate": 1.6268129072525983e-05,
"loss": 1.0376,
"step": 14720
},
{
"epoch": 1.4752,
"grad_norm": 1.5352367162704468,
"learning_rate": 1.6083038843339717e-05,
"loss": 1.0322,
"step": 14752
},
{
"epoch": 1.4784,
"grad_norm": 1.064229965209961,
"learning_rate": 1.589880556464121e-05,
"loss": 0.9783,
"step": 14784
},
{
"epoch": 1.4816,
"grad_norm": 1.3297624588012695,
"learning_rate": 1.5715433891293206e-05,
"loss": 1.0229,
"step": 14816
},
{
"epoch": 1.4848,
"grad_norm": 1.3579192161560059,
"learning_rate": 1.5532928456389e-05,
"loss": 0.9976,
"step": 14848
},
{
"epoch": 1.488,
"grad_norm": 1.59537935256958,
"learning_rate": 1.535129387113534e-05,
"loss": 1.0262,
"step": 14880
},
{
"epoch": 1.4912,
"grad_norm": 1.2281315326690674,
"learning_rate": 1.5170534724736058e-05,
"loss": 0.983,
"step": 14912
},
{
"epoch": 1.4944,
"grad_norm": 1.334694743156433,
"learning_rate": 1.4990655584275931e-05,
"loss": 1.0427,
"step": 14944
},
{
"epoch": 1.4976,
"grad_norm": 1.07719886302948,
"learning_rate": 1.4811660994605465e-05,
"loss": 0.9257,
"step": 14976
},
{
"epoch": 1.5008,
"grad_norm": 0.7927564978599548,
"learning_rate": 1.4633555478225974e-05,
"loss": 0.9359,
"step": 15008
},
{
"epoch": 1.504,
"grad_norm": 0.9496012926101685,
"learning_rate": 1.4456343535175276e-05,
"loss": 0.9869,
"step": 15040
},
{
"epoch": 1.5072,
"grad_norm": 1.1386374235153198,
"learning_rate": 1.4280029642914117e-05,
"loss": 1.0657,
"step": 15072
},
{
"epoch": 1.5104,
"grad_norm": 1.0637845993041992,
"learning_rate": 1.4104618256212926e-05,
"loss": 0.9873,
"step": 15104
},
{
"epoch": 1.5135999999999998,
"grad_norm": 0.9361437559127808,
"learning_rate": 1.3930113807039297e-05,
"loss": 1.0041,
"step": 15136
},
{
"epoch": 1.5168,
"grad_norm": 1.1009209156036377,
"learning_rate": 1.3756520704446068e-05,
"loss": 1.017,
"step": 15168
},
{
"epoch": 1.52,
"grad_norm": 1.3386567831039429,
"learning_rate": 1.3583843334459812e-05,
"loss": 1.0768,
"step": 15200
},
{
"epoch": 1.5232,
"grad_norm": 1.0036081075668335,
"learning_rate": 1.3412086059970141e-05,
"loss": 1.0356,
"step": 15232
},
{
"epoch": 1.5264,
"grad_norm": 1.0627269744873047,
"learning_rate": 1.3241253220619355e-05,
"loss": 1.0667,
"step": 15264
},
{
"epoch": 1.5295999999999998,
"grad_norm": 1.1370049715042114,
"learning_rate": 1.3071349132692895e-05,
"loss": 0.9148,
"step": 15296
},
{
"epoch": 1.5328,
"grad_norm": 1.1615465879440308,
"learning_rate": 1.2902378089010208e-05,
"loss": 1.0001,
"step": 15328
},
{
"epoch": 1.536,
"grad_norm": 1.2635860443115234,
"learning_rate": 1.2734344358816341e-05,
"loss": 0.9977,
"step": 15360
},
{
"epoch": 1.5392000000000001,
"grad_norm": 1.03976309299469,
"learning_rate": 1.2567252187674072e-05,
"loss": 0.9745,
"step": 15392
},
{
"epoch": 1.5424,
"grad_norm": 0.9221481084823608,
"learning_rate": 1.2401105797356566e-05,
"loss": 1.1097,
"step": 15424
},
{
"epoch": 1.5455999999999999,
"grad_norm": 1.0720359086990356,
"learning_rate": 1.2235909385740824e-05,
"loss": 1.0531,
"step": 15456
},
{
"epoch": 1.5488,
"grad_norm": 0.9620745778083801,
"learning_rate": 1.2071667126701514e-05,
"loss": 0.98,
"step": 15488
},
{
"epoch": 1.552,
"grad_norm": 3.6036980152130127,
"learning_rate": 1.1908383170005567e-05,
"loss": 1.0025,
"step": 15520
},
{
"epoch": 1.5552000000000001,
"grad_norm": 1.0159907341003418,
"learning_rate": 1.174606164120734e-05,
"loss": 1.0149,
"step": 15552
},
{
"epoch": 1.5584,
"grad_norm": 1.032041072845459,
"learning_rate": 1.1584706641544319e-05,
"loss": 1.0008,
"step": 15584
},
{
"epoch": 1.5615999999999999,
"grad_norm": 1.0777499675750732,
"learning_rate": 1.142432224783359e-05,
"loss": 1.0494,
"step": 15616
},
{
"epoch": 1.5648,
"grad_norm": 1.3203213214874268,
"learning_rate": 1.1264912512368714e-05,
"loss": 1.0905,
"step": 15648
},
{
"epoch": 1.568,
"grad_norm": 1.165831446647644,
"learning_rate": 1.110648146281747e-05,
"loss": 0.9783,
"step": 15680
},
{
"epoch": 1.5712000000000002,
"grad_norm": 1.350396752357483,
"learning_rate": 1.0949033102119966e-05,
"loss": 1.0549,
"step": 15712
},
{
"epoch": 1.5744,
"grad_norm": 0.9310358762741089,
"learning_rate": 1.0792571408387608e-05,
"loss": 0.9963,
"step": 15744
},
{
"epoch": 1.5776,
"grad_norm": 0.8703174591064453,
"learning_rate": 1.063710033480254e-05,
"loss": 0.9774,
"step": 15776
},
{
"epoch": 1.5808,
"grad_norm": 1.2223252058029175,
"learning_rate": 1.0482623809517727e-05,
"loss": 1.1114,
"step": 15808
},
{
"epoch": 1.584,
"grad_norm": 1.393531322479248,
"learning_rate": 1.0329145735557788e-05,
"loss": 1.006,
"step": 15840
},
{
"epoch": 1.5872000000000002,
"grad_norm": 1.0481770038604736,
"learning_rate": 1.0176669990720305e-05,
"loss": 1.0289,
"step": 15872
},
{
"epoch": 1.5904,
"grad_norm": 0.8939660787582397,
"learning_rate": 1.0025200427477876e-05,
"loss": 0.9696,
"step": 15904
},
{
"epoch": 1.5936,
"grad_norm": 0.986250102519989,
"learning_rate": 9.874740872880822e-06,
"loss": 1.0411,
"step": 15936
},
{
"epoch": 1.5968,
"grad_norm": 1.6278936862945557,
"learning_rate": 9.725295128460393e-06,
"loss": 0.9622,
"step": 15968
},
{
"epoch": 1.6,
"grad_norm": 1.066041350364685,
"learning_rate": 9.57686697013283e-06,
"loss": 0.9735,
"step": 16000
},
{
"epoch": 1.6032,
"grad_norm": 1.3626492023468018,
"learning_rate": 9.429460148103857e-06,
"loss": 1.0541,
"step": 16032
},
{
"epoch": 1.6064,
"grad_norm": 1.753517985343933,
"learning_rate": 9.283078386774025e-06,
"loss": 0.9463,
"step": 16064
},
{
"epoch": 1.6096,
"grad_norm": 1.5154976844787598,
"learning_rate": 9.137725384644513e-06,
"loss": 0.9803,
"step": 16096
},
{
"epoch": 1.6128,
"grad_norm": 1.3112058639526367,
"learning_rate": 8.99789916813244e-06,
"loss": 0.9787,
"step": 16128
},
{
"epoch": 1.616,
"grad_norm": 1.1479904651641846,
"learning_rate": 8.85458224346668e-06,
"loss": 0.9706,
"step": 16160
},
{
"epoch": 1.6192,
"grad_norm": 1.6259170770645142,
"learning_rate": 8.712304904442254e-06,
"loss": 0.9407,
"step": 16192
},
{
"epoch": 1.6223999999999998,
"grad_norm": 1.265771508216858,
"learning_rate": 8.571070745857496e-06,
"loss": 0.9805,
"step": 16224
},
{
"epoch": 1.6256,
"grad_norm": 1.184053897857666,
"learning_rate": 8.430883336153578e-06,
"loss": 0.8846,
"step": 16256
},
{
"epoch": 1.6288,
"grad_norm": 1.0688974857330322,
"learning_rate": 8.291746217324392e-06,
"loss": 0.8709,
"step": 16288
},
{
"epoch": 1.6320000000000001,
"grad_norm": 1.2258456945419312,
"learning_rate": 8.153662904827058e-06,
"loss": 0.8978,
"step": 16320
},
{
"epoch": 1.6352,
"grad_norm": 1.2160283327102661,
"learning_rate": 8.016636887493033e-06,
"loss": 1.0279,
"step": 16352
},
{
"epoch": 1.6383999999999999,
"grad_norm": 1.1910878419876099,
"learning_rate": 7.880671627440067e-06,
"loss": 0.9773,
"step": 16384
},
{
"epoch": 1.6416,
"grad_norm": 1.1277639865875244,
"learning_rate": 7.745770559984622e-06,
"loss": 1.0062,
"step": 16416
},
{
"epoch": 1.6448,
"grad_norm": 1.165391445159912,
"learning_rate": 7.611937093555182e-06,
"loss": 0.9795,
"step": 16448
},
{
"epoch": 1.6480000000000001,
"grad_norm": 1.623434066772461,
"learning_rate": 7.479174609606027e-06,
"loss": 1.0385,
"step": 16480
},
{
"epoch": 1.6512,
"grad_norm": 0.9764330387115479,
"learning_rate": 7.347486462531899e-06,
"loss": 1.0274,
"step": 16512
},
{
"epoch": 1.6543999999999999,
"grad_norm": 1.3088263273239136,
"learning_rate": 7.216875979583171e-06,
"loss": 0.975,
"step": 16544
},
{
"epoch": 1.6576,
"grad_norm": 1.1035621166229248,
"learning_rate": 7.0873464607817965e-06,
"loss": 1.0245,
"step": 16576
},
{
"epoch": 1.6608,
"grad_norm": 1.3052529096603394,
"learning_rate": 6.95890117883799e-06,
"loss": 1.048,
"step": 16608
},
{
"epoch": 1.6640000000000001,
"grad_norm": 1.2149062156677246,
"learning_rate": 6.8315433790674396e-06,
"loss": 0.9791,
"step": 16640
},
{
"epoch": 1.6672,
"grad_norm": 0.8300272226333618,
"learning_rate": 6.7052762793094085e-06,
"loss": 0.9096,
"step": 16672
},
{
"epoch": 1.6703999999999999,
"grad_norm": 1.1794402599334717,
"learning_rate": 6.580103069845367e-06,
"loss": 0.9758,
"step": 16704
},
{
"epoch": 1.6736,
"grad_norm": 1.0243241786956787,
"learning_rate": 6.456026913318397e-06,
"loss": 1.0208,
"step": 16736
},
{
"epoch": 1.6768,
"grad_norm": 1.0999525785446167,
"learning_rate": 6.3330509446533185e-06,
"loss": 1.0049,
"step": 16768
},
{
"epoch": 1.6800000000000002,
"grad_norm": 1.0554022789001465,
"learning_rate": 6.2111782709774244e-06,
"loss": 1.0107,
"step": 16800
},
{
"epoch": 1.6832,
"grad_norm": 0.9995157122612,
"learning_rate": 6.090411971542037e-06,
"loss": 1.0186,
"step": 16832
},
{
"epoch": 1.6864,
"grad_norm": 2.1441192626953125,
"learning_rate": 5.970755097644676e-06,
"loss": 0.9638,
"step": 16864
},
{
"epoch": 1.6896,
"grad_norm": 0.9487547278404236,
"learning_rate": 5.852210672551956e-06,
"loss": 0.8976,
"step": 16896
},
{
"epoch": 1.6928,
"grad_norm": 1.5148869752883911,
"learning_rate": 5.734781691423208e-06,
"loss": 1.05,
"step": 16928
},
{
"epoch": 1.696,
"grad_norm": 1.5390050411224365,
"learning_rate": 5.618471121234803e-06,
"loss": 1.0052,
"step": 16960
},
{
"epoch": 1.6992,
"grad_norm": 1.3450927734375,
"learning_rate": 5.503281900705226e-06,
"loss": 1.0556,
"step": 16992
},
{
"epoch": 1.7024,
"grad_norm": 1.5494674444198608,
"learning_rate": 5.389216940220743e-06,
"loss": 1.0307,
"step": 17024
},
{
"epoch": 1.7056,
"grad_norm": 1.4470009803771973,
"learning_rate": 5.276279121761946e-06,
"loss": 0.9268,
"step": 17056
},
{
"epoch": 1.7088,
"grad_norm": 1.1550896167755127,
"learning_rate": 5.164471298830908e-06,
"loss": 0.983,
"step": 17088
},
{
"epoch": 1.712,
"grad_norm": 1.681154727935791,
"learning_rate": 5.05379629637906e-06,
"loss": 1.0532,
"step": 17120
},
{
"epoch": 1.7151999999999998,
"grad_norm": 1.6035993099212646,
"learning_rate": 4.9442569107358675e-06,
"loss": 1.0431,
"step": 17152
},
{
"epoch": 1.7184,
"grad_norm": 0.9412198066711426,
"learning_rate": 4.835855909538111e-06,
"loss": 0.9953,
"step": 17184
},
{
"epoch": 1.7216,
"grad_norm": 1.1886653900146484,
"learning_rate": 4.728596031660032e-06,
"loss": 1.0136,
"step": 17216
},
{
"epoch": 1.7248,
"grad_norm": 1.0479758977890015,
"learning_rate": 4.622479987144096e-06,
"loss": 1.0124,
"step": 17248
},
{
"epoch": 1.728,
"grad_norm": 0.7669851779937744,
"learning_rate": 4.517510457132501e-06,
"loss": 1.0399,
"step": 17280
},
{
"epoch": 1.7311999999999999,
"grad_norm": 1.1323778629302979,
"learning_rate": 4.41369009379946e-06,
"loss": 1.0598,
"step": 17312
},
{
"epoch": 1.7344,
"grad_norm": 1.6534825563430786,
"learning_rate": 4.311021520284192e-06,
"loss": 1.0639,
"step": 17344
},
{
"epoch": 1.7376,
"grad_norm": 1.0984747409820557,
"learning_rate": 4.2095073306246404e-06,
"loss": 1.0477,
"step": 17376
},
{
"epoch": 1.7408000000000001,
"grad_norm": 1.204331636428833,
"learning_rate": 4.109150089691949e-06,
"loss": 1.0156,
"step": 17408
},
{
"epoch": 1.744,
"grad_norm": 1.214034914970398,
"learning_rate": 4.009952333125599e-06,
"loss": 0.9898,
"step": 17440
},
{
"epoch": 1.7471999999999999,
"grad_norm": 1.1294769048690796,
"learning_rate": 3.911916567269419e-06,
"loss": 1.0338,
"step": 17472
},
{
"epoch": 1.7504,
"grad_norm": 1.099940538406372,
"learning_rate": 3.815045269108208e-06,
"loss": 0.9481,
"step": 17504
},
{
"epoch": 1.7536,
"grad_norm": 0.9036813974380493,
"learning_rate": 3.7193408862051806e-06,
"loss": 1.0581,
"step": 17536
},
{
"epoch": 1.7568000000000001,
"grad_norm": 1.9300788640975952,
"learning_rate": 3.6248058366400884e-06,
"loss": 0.9628,
"step": 17568
},
{
"epoch": 1.76,
"grad_norm": 1.3298760652542114,
"learning_rate": 3.5314425089481795e-06,
"loss": 0.9576,
"step": 17600
},
{
"epoch": 1.7631999999999999,
"grad_norm": 0.9169399738311768,
"learning_rate": 3.4392532620598216e-06,
"loss": 0.909,
"step": 17632
},
{
"epoch": 1.7664,
"grad_norm": 2.3942863941192627,
"learning_rate": 3.348240425240873e-06,
"loss": 0.9977,
"step": 17664
},
{
"epoch": 1.7696,
"grad_norm": 1.1257165670394897,
"learning_rate": 3.258406298033867e-06,
"loss": 0.9888,
"step": 17696
},
{
"epoch": 1.7728000000000002,
"grad_norm": 1.5241613388061523,
"learning_rate": 3.1697531501999e-06,
"loss": 1.0051,
"step": 17728
},
{
"epoch": 1.776,
"grad_norm": 1.7748416662216187,
"learning_rate": 3.0822832216613084e-06,
"loss": 0.9575,
"step": 17760
},
{
"epoch": 1.7792,
"grad_norm": 1.1018184423446655,
"learning_rate": 2.995998722445026e-06,
"loss": 0.9552,
"step": 17792
},
{
"epoch": 1.7824,
"grad_norm": 0.9150426983833313,
"learning_rate": 2.9109018326267724e-06,
"loss": 1.0439,
"step": 17824
},
{
"epoch": 1.7856,
"grad_norm": 1.4982606172561646,
"learning_rate": 2.826994702275987e-06,
"loss": 0.9887,
"step": 17856
},
{
"epoch": 1.7888,
"grad_norm": 0.9846287369728088,
"learning_rate": 2.744279451401455e-06,
"loss": 1.0013,
"step": 17888
},
{
"epoch": 1.792,
"grad_norm": 1.326261043548584,
"learning_rate": 2.6627581698978222e-06,
"loss": 1.0104,
"step": 17920
},
{
"epoch": 1.7952,
"grad_norm": 1.1285284757614136,
"learning_rate": 2.5824329174926885e-06,
"loss": 0.9727,
"step": 17952
},
{
"epoch": 1.7984,
"grad_norm": 2.2517499923706055,
"learning_rate": 2.50330572369466e-06,
"loss": 1.0337,
"step": 17984
},
{
"epoch": 1.8016,
"grad_norm": 1.0541377067565918,
"learning_rate": 2.4253785877420386e-06,
"loss": 1.0562,
"step": 18016
},
{
"epoch": 1.8048,
"grad_norm": 1.0717918872833252,
"learning_rate": 2.348653478552276e-06,
"loss": 0.9698,
"step": 18048
},
{
"epoch": 1.808,
"grad_norm": 1.1820138692855835,
"learning_rate": 2.2731323346722677e-06,
"loss": 1.1274,
"step": 18080
},
{
"epoch": 1.8112,
"grad_norm": 2.2512147426605225,
"learning_rate": 2.1988170642293525e-06,
"loss": 0.9754,
"step": 18112
},
{
"epoch": 1.8144,
"grad_norm": 1.1447445154190063,
"learning_rate": 2.1257095448831256e-06,
"loss": 1.0539,
"step": 18144
},
{
"epoch": 1.8176,
"grad_norm": 1.3776092529296875,
"learning_rate": 2.0538116237779736e-06,
"loss": 1.0637,
"step": 18176
},
{
"epoch": 1.8208,
"grad_norm": 1.0882554054260254,
"learning_rate": 1.9831251174964037e-06,
"loss": 0.9859,
"step": 18208
},
{
"epoch": 1.8239999999999998,
"grad_norm": 1.1768244504928589,
"learning_rate": 1.913651812013173e-06,
"loss": 0.9577,
"step": 18240
},
{
"epoch": 1.8272,
"grad_norm": 1.1588401794433594,
"learning_rate": 1.8453934626501191e-06,
"loss": 0.9878,
"step": 18272
},
{
"epoch": 1.8304,
"grad_norm": 1.0095770359039307,
"learning_rate": 1.7783517940318517e-06,
"loss": 0.9374,
"step": 18304
},
{
"epoch": 1.8336000000000001,
"grad_norm": 1.5825436115264893,
"learning_rate": 1.7125285000421597e-06,
"loss": 1.0364,
"step": 18336
},
{
"epoch": 1.8368,
"grad_norm": 1.4219987392425537,
"learning_rate": 1.6499256118782503e-06,
"loss": 1.0511,
"step": 18368
},
{
"epoch": 1.8399999999999999,
"grad_norm": 1.5104936361312866,
"learning_rate": 1.5865058240322139e-06,
"loss": 0.9834,
"step": 18400
},
{
"epoch": 1.8432,
"grad_norm": 1.3107426166534424,
"learning_rate": 1.5243092580207507e-06,
"loss": 1.0268,
"step": 18432
},
{
"epoch": 1.8464,
"grad_norm": 1.1371307373046875,
"learning_rate": 1.463337485310634e-06,
"loss": 1.0525,
"step": 18464
},
{
"epoch": 1.8496000000000001,
"grad_norm": 0.9265105724334717,
"learning_rate": 1.4035920464228526e-06,
"loss": 1.0258,
"step": 18496
},
{
"epoch": 1.8528,
"grad_norm": 1.02960205078125,
"learning_rate": 1.3450744508936686e-06,
"loss": 1.0408,
"step": 18528
},
{
"epoch": 1.8559999999999999,
"grad_norm": 1.0141865015029907,
"learning_rate": 1.2877861772365108e-06,
"loss": 0.9408,
"step": 18560
},
{
"epoch": 1.8592,
"grad_norm": 1.0903533697128296,
"learning_rate": 1.2317286729045586e-06,
"loss": 1.0014,
"step": 18592
},
{
"epoch": 1.8624,
"grad_norm": 1.0181169509887695,
"learning_rate": 1.1769033542542552e-06,
"loss": 0.9902,
"step": 18624
},
{
"epoch": 1.8656000000000001,
"grad_norm": 1.112073540687561,
"learning_rate": 1.1233116065094362e-06,
"loss": 0.9636,
"step": 18656
},
{
"epoch": 1.8688,
"grad_norm": 1.0665274858474731,
"learning_rate": 1.0709547837263966e-06,
"loss": 0.9581,
"step": 18688
},
{
"epoch": 1.8719999999999999,
"grad_norm": 0.9692000150680542,
"learning_rate": 1.019834208759629e-06,
"loss": 0.9085,
"step": 18720
},
{
"epoch": 1.8752,
"grad_norm": 1.0520868301391602,
"learning_rate": 9.699511732284393e-07,
"loss": 0.9865,
"step": 18752
},
{
"epoch": 1.8784,
"grad_norm": 1.3581960201263428,
"learning_rate": 9.213069374842953e-07,
"loss": 1.1286,
"step": 18784
},
{
"epoch": 1.8816000000000002,
"grad_norm": 1.0845204591751099,
"learning_rate": 8.739027305789683e-07,
"loss": 1.1132,
"step": 18816
},
{
"epoch": 1.8848,
"grad_norm": 0.9488012790679932,
"learning_rate": 8.277397502335194e-07,
"loss": 1.0191,
"step": 18848
},
{
"epoch": 1.888,
"grad_norm": 1.2828950881958008,
"learning_rate": 7.82819162807985e-07,
"loss": 1.0149,
"step": 18880
},
{
"epoch": 1.8912,
"grad_norm": 1.0246376991271973,
"learning_rate": 7.391421032719559e-07,
"loss": 1.0022,
"step": 18912
},
{
"epoch": 1.8944,
"grad_norm": 1.1224380731582642,
"learning_rate": 6.967096751758773e-07,
"loss": 0.9888,
"step": 18944
},
{
"epoch": 1.8976,
"grad_norm": 1.0227912664413452,
"learning_rate": 6.555229506231608e-07,
"loss": 1.052,
"step": 18976
},
{
"epoch": 1.9008,
"grad_norm": 1.1570532321929932,
"learning_rate": 6.15582970243117e-07,
"loss": 0.9705,
"step": 19008
},
{
"epoch": 1.904,
"grad_norm": 1.1749358177185059,
"learning_rate": 5.76890743164632e-07,
"loss": 0.988,
"step": 19040
},
{
"epoch": 1.9072,
"grad_norm": 1.036882758140564,
"learning_rate": 5.394472469907208e-07,
"loss": 0.9823,
"step": 19072
},
{
"epoch": 1.9104,
"grad_norm": 0.9629050493240356,
"learning_rate": 5.032534277737643e-07,
"loss": 1.0453,
"step": 19104
},
{
"epoch": 1.9136,
"grad_norm": 1.1531473398208618,
"learning_rate": 4.6831019999165617e-07,
"loss": 0.9428,
"step": 19136
},
{
"epoch": 1.9167999999999998,
"grad_norm": 1.4971739053726196,
"learning_rate": 4.3461844652467607e-07,
"loss": 1.0141,
"step": 19168
},
{
"epoch": 1.92,
"grad_norm": 1.0155481100082397,
"learning_rate": 4.021790186331753e-07,
"loss": 1.1105,
"step": 19200
},
{
"epoch": 1.9232,
"grad_norm": 1.2783689498901367,
"learning_rate": 3.709927359360932e-07,
"loss": 0.9969,
"step": 19232
},
{
"epoch": 1.9264000000000001,
"grad_norm": 1.723365306854248,
"learning_rate": 3.410603863902406e-07,
"loss": 0.9682,
"step": 19264
},
{
"epoch": 1.9296,
"grad_norm": 0.9992212653160095,
"learning_rate": 3.123827262703549e-07,
"loss": 0.9813,
"step": 19296
},
{
"epoch": 1.9327999999999999,
"grad_norm": 1.958424687385559,
"learning_rate": 2.849604801500538e-07,
"loss": 0.9899,
"step": 19328
},
{
"epoch": 1.936,
"grad_norm": 1.2963917255401611,
"learning_rate": 2.5879434088348366e-07,
"loss": 1.0372,
"step": 19360
},
{
"epoch": 1.9392,
"grad_norm": 1.067751169204712,
"learning_rate": 2.3388496958782202e-07,
"loss": 1.0061,
"step": 19392
},
{
"epoch": 1.9424000000000001,
"grad_norm": 1.1410748958587646,
"learning_rate": 2.1023299562658583e-07,
"loss": 0.9716,
"step": 19424
},
{
"epoch": 1.9456,
"grad_norm": 1.5528861284255981,
"learning_rate": 1.878390165937216e-07,
"loss": 0.9582,
"step": 19456
},
{
"epoch": 1.9487999999999999,
"grad_norm": 1.1877301931381226,
"learning_rate": 1.6670359829850657e-07,
"loss": 1.0267,
"step": 19488
},
{
"epoch": 1.952,
"grad_norm": 1.0313178300857544,
"learning_rate": 1.468272747512489e-07,
"loss": 1.0209,
"step": 19520
},
{
"epoch": 1.9552,
"grad_norm": 1.2904688119888306,
"learning_rate": 1.282105481498097e-07,
"loss": 0.9588,
"step": 19552
},
{
"epoch": 1.9584000000000001,
"grad_norm": 0.9333122968673706,
"learning_rate": 1.1085388886689085e-07,
"loss": 1.034,
"step": 19584
},
{
"epoch": 1.9616,
"grad_norm": 0.9121220111846924,
"learning_rate": 9.475773543818344e-08,
"loss": 1.0108,
"step": 19616
},
{
"epoch": 1.9647999999999999,
"grad_norm": 0.8525537252426147,
"learning_rate": 7.99224945512489e-08,
"loss": 0.9451,
"step": 19648
},
{
"epoch": 1.968,
"grad_norm": 1.2468888759613037,
"learning_rate": 6.63485410352771e-08,
"loss": 0.9831,
"step": 19680
},
{
"epoch": 1.9712,
"grad_norm": 1.024377465248108,
"learning_rate": 5.4036217851594075e-08,
"loss": 0.9527,
"step": 19712
},
{
"epoch": 1.9744000000000002,
"grad_norm": 1.2195112705230713,
"learning_rate": 4.2985836085013275e-08,
"loss": 1.0317,
"step": 19744
},
{
"epoch": 1.9776,
"grad_norm": 0.9469048976898193,
"learning_rate": 3.31976749359586e-08,
"loss": 1.0474,
"step": 19776
},
{
"epoch": 1.9808,
"grad_norm": 2.0492160320281982,
"learning_rate": 2.467198171342e-08,
"loss": 1.0066,
"step": 19808
},
{
"epoch": 1.984,
"grad_norm": 1.0785330533981323,
"learning_rate": 1.7408971828714038e-08,
"loss": 0.962,
"step": 19840
},
{
"epoch": 1.9872,
"grad_norm": 1.0612059831619263,
"learning_rate": 1.1408828790010484e-08,
"loss": 0.9678,
"step": 19872
},
{
"epoch": 1.9904,
"grad_norm": 0.8084181547164917,
"learning_rate": 6.671704197735995e-09,
"loss": 0.9629,
"step": 19904
},
{
"epoch": 1.9936,
"grad_norm": 1.1087327003479004,
"learning_rate": 3.1977177407105372e-09,
"loss": 1.0333,
"step": 19936
},
{
"epoch": 1.9968,
"grad_norm": 1.4450087547302246,
"learning_rate": 9.869571931442334e-10,
"loss": 0.9335,
"step": 19968
},
{
"epoch": 2.0,
"grad_norm": 1.318145751953125,
"learning_rate": 3.9478412411364515e-11,
"loss": 0.9543,
"step": 20000
}
],
"logging_steps": 32,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.082339299367977e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}