dumb-dev's picture
at least i generates some kind of output now...
8bb1997 verified
raw
history blame contribute delete
No virus
53.3 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032,
"grad_norm": 0.7325745224952698,
"learning_rate": 9.99993683466483e-05,
"loss": 1.3543,
"step": 32
},
{
"epoch": 0.0064,
"grad_norm": 0.8166712522506714,
"learning_rate": 9.999747340255259e-05,
"loss": 1.2915,
"step": 64
},
{
"epoch": 0.0096,
"grad_norm": 1.328125238418579,
"learning_rate": 9.999431521559082e-05,
"loss": 1.2584,
"step": 96
},
{
"epoch": 0.0128,
"grad_norm": 1.0592037439346313,
"learning_rate": 9.998989386555814e-05,
"loss": 1.2065,
"step": 128
},
{
"epoch": 0.016,
"grad_norm": 1.292716145515442,
"learning_rate": 9.9984209464165e-05,
"loss": 1.1616,
"step": 160
},
{
"epoch": 0.0192,
"grad_norm": 1.1350401639938354,
"learning_rate": 9.997726215503422e-05,
"loss": 1.1175,
"step": 192
},
{
"epoch": 0.0224,
"grad_norm": 1.0363103151321411,
"learning_rate": 9.996905211369748e-05,
"loss": 1.1306,
"step": 224
},
{
"epoch": 0.0256,
"grad_norm": 1.3373076915740967,
"learning_rate": 9.995957954759071e-05,
"loss": 1.1809,
"step": 256
},
{
"epoch": 0.0288,
"grad_norm": 1.1969462633132935,
"learning_rate": 9.994884469604912e-05,
"loss": 1.2335,
"step": 288
},
{
"epoch": 0.032,
"grad_norm": 1.1673309803009033,
"learning_rate": 9.993684783030088e-05,
"loss": 1.1687,
"step": 320
},
{
"epoch": 0.0352,
"grad_norm": 1.849138617515564,
"learning_rate": 9.99235892534604e-05,
"loss": 1.1873,
"step": 352
},
{
"epoch": 0.0384,
"grad_norm": 1.3689218759536743,
"learning_rate": 9.990906930052064e-05,
"loss": 1.093,
"step": 384
},
{
"epoch": 0.0416,
"grad_norm": 1.2357085943222046,
"learning_rate": 9.989328833834471e-05,
"loss": 1.1549,
"step": 416
},
{
"epoch": 0.0448,
"grad_norm": 1.5436211824417114,
"learning_rate": 9.987624676565652e-05,
"loss": 1.1943,
"step": 448
},
{
"epoch": 0.048,
"grad_norm": 1.1809171438217163,
"learning_rate": 9.98579450130307e-05,
"loss": 1.1305,
"step": 480
},
{
"epoch": 0.0512,
"grad_norm": 1.1661288738250732,
"learning_rate": 9.983838354288181e-05,
"loss": 1.0564,
"step": 512
},
{
"epoch": 0.0544,
"grad_norm": 2.7672770023345947,
"learning_rate": 9.981756284945256e-05,
"loss": 1.1576,
"step": 544
},
{
"epoch": 0.0576,
"grad_norm": 1.6773067712783813,
"learning_rate": 9.979548345880141e-05,
"loss": 1.0685,
"step": 576
},
{
"epoch": 0.0608,
"grad_norm": 1.1834300756454468,
"learning_rate": 9.977214592878916e-05,
"loss": 1.107,
"step": 608
},
{
"epoch": 0.064,
"grad_norm": 1.0704506635665894,
"learning_rate": 9.974755084906502e-05,
"loss": 1.1127,
"step": 640
},
{
"epoch": 0.0672,
"grad_norm": 2.3311216831207275,
"learning_rate": 9.972169884105153e-05,
"loss": 1.1119,
"step": 672
},
{
"epoch": 0.0704,
"grad_norm": 1.1934360265731812,
"learning_rate": 9.969459055792903e-05,
"loss": 1.1084,
"step": 704
},
{
"epoch": 0.0736,
"grad_norm": 1.8318133354187012,
"learning_rate": 9.9666226684619e-05,
"loss": 1.1249,
"step": 736
},
{
"epoch": 0.0768,
"grad_norm": 1.3005311489105225,
"learning_rate": 9.963660793776688e-05,
"loss": 1.0904,
"step": 768
},
{
"epoch": 0.08,
"grad_norm": 1.3241008520126343,
"learning_rate": 9.96057350657239e-05,
"loss": 1.0616,
"step": 800
},
{
"epoch": 0.0832,
"grad_norm": 2.055724620819092,
"learning_rate": 9.957360884852817e-05,
"loss": 1.1737,
"step": 832
},
{
"epoch": 0.0864,
"grad_norm": 1.22575044631958,
"learning_rate": 9.954023009788504e-05,
"loss": 1.0874,
"step": 864
},
{
"epoch": 0.0896,
"grad_norm": 1.0968865156173706,
"learning_rate": 9.950559965714648e-05,
"loss": 1.0928,
"step": 896
},
{
"epoch": 0.0928,
"grad_norm": 1.220700740814209,
"learning_rate": 9.946971840128981e-05,
"loss": 1.1083,
"step": 928
},
{
"epoch": 0.096,
"grad_norm": 2.3711702823638916,
"learning_rate": 9.94325872368957e-05,
"loss": 1.1401,
"step": 960
},
{
"epoch": 0.0992,
"grad_norm": 1.261365532875061,
"learning_rate": 9.939420710212511e-05,
"loss": 1.159,
"step": 992
},
{
"epoch": 0.1024,
"grad_norm": 1.2131611108779907,
"learning_rate": 9.935457896669568e-05,
"loss": 1.1364,
"step": 1024
},
{
"epoch": 0.1056,
"grad_norm": 1.0336030721664429,
"learning_rate": 9.931370383185718e-05,
"loss": 1.0561,
"step": 1056
},
{
"epoch": 0.1088,
"grad_norm": 2.2293291091918945,
"learning_rate": 9.927158273036625e-05,
"loss": 1.121,
"step": 1088
},
{
"epoch": 0.112,
"grad_norm": 0.9839213490486145,
"learning_rate": 9.922821672646027e-05,
"loss": 1.1557,
"step": 1120
},
{
"epoch": 0.1152,
"grad_norm": 1.3354933261871338,
"learning_rate": 9.918360691583056e-05,
"loss": 1.0198,
"step": 1152
},
{
"epoch": 0.1184,
"grad_norm": 1.2504842281341553,
"learning_rate": 9.913775442559452e-05,
"loss": 1.0997,
"step": 1184
},
{
"epoch": 0.1216,
"grad_norm": 1.236770749092102,
"learning_rate": 9.909066041426733e-05,
"loss": 1.1579,
"step": 1216
},
{
"epoch": 0.1248,
"grad_norm": 1.1531703472137451,
"learning_rate": 9.904232607173262e-05,
"loss": 1.1022,
"step": 1248
},
{
"epoch": 0.128,
"grad_norm": 1.2251778841018677,
"learning_rate": 9.899275261921234e-05,
"loss": 1.1239,
"step": 1280
},
{
"epoch": 0.1312,
"grad_norm": 1.0417462587356567,
"learning_rate": 9.894194130923602e-05,
"loss": 1.1896,
"step": 1312
},
{
"epoch": 0.1344,
"grad_norm": 1.3808753490447998,
"learning_rate": 9.888989342560899e-05,
"loss": 1.096,
"step": 1344
},
{
"epoch": 0.1376,
"grad_norm": 1.349967360496521,
"learning_rate": 9.883661028338008e-05,
"loss": 1.077,
"step": 1376
},
{
"epoch": 0.1408,
"grad_norm": 2.028076648712158,
"learning_rate": 9.87820932288083e-05,
"loss": 1.0932,
"step": 1408
},
{
"epoch": 0.144,
"grad_norm": 1.088742971420288,
"learning_rate": 9.872634363932887e-05,
"loss": 1.1665,
"step": 1440
},
{
"epoch": 0.1472,
"grad_norm": 1.1702725887298584,
"learning_rate": 9.866936292351836e-05,
"loss": 1.058,
"step": 1472
},
{
"epoch": 0.1504,
"grad_norm": 1.2243529558181763,
"learning_rate": 9.861115252105921e-05,
"loss": 1.131,
"step": 1504
},
{
"epoch": 0.1536,
"grad_norm": 1.4041122198104858,
"learning_rate": 9.855171390270324e-05,
"loss": 1.0844,
"step": 1536
},
{
"epoch": 0.1568,
"grad_norm": 1.4032260179519653,
"learning_rate": 9.849104857023455e-05,
"loss": 1.1046,
"step": 1568
},
{
"epoch": 0.16,
"grad_norm": 2.4822256565093994,
"learning_rate": 9.842915805643155e-05,
"loss": 1.0779,
"step": 1600
},
{
"epoch": 0.1632,
"grad_norm": 1.7823238372802734,
"learning_rate": 9.83660439250283e-05,
"loss": 1.0,
"step": 1632
},
{
"epoch": 0.1664,
"grad_norm": 1.3723595142364502,
"learning_rate": 9.830170777067485e-05,
"loss": 1.0838,
"step": 1664
},
{
"epoch": 0.1696,
"grad_norm": 1.4893419742584229,
"learning_rate": 9.823615121889716e-05,
"loss": 1.0734,
"step": 1696
},
{
"epoch": 0.1728,
"grad_norm": 1.2856264114379883,
"learning_rate": 9.816937592605579e-05,
"loss": 1.0497,
"step": 1728
},
{
"epoch": 0.176,
"grad_norm": 1.2529082298278809,
"learning_rate": 9.81013835793043e-05,
"loss": 1.0624,
"step": 1760
},
{
"epoch": 0.1792,
"grad_norm": 1.107729196548462,
"learning_rate": 9.80321758965464e-05,
"loss": 1.0491,
"step": 1792
},
{
"epoch": 0.1824,
"grad_norm": 0.9313052296638489,
"learning_rate": 9.796175462639272e-05,
"loss": 1.1561,
"step": 1824
},
{
"epoch": 0.1856,
"grad_norm": 1.3460460901260376,
"learning_rate": 9.789012154811647e-05,
"loss": 1.0803,
"step": 1856
},
{
"epoch": 0.1888,
"grad_norm": 1.5794706344604492,
"learning_rate": 9.781727847160865e-05,
"loss": 1.0698,
"step": 1888
},
{
"epoch": 0.192,
"grad_norm": 1.2449215650558472,
"learning_rate": 9.774322723733216e-05,
"loss": 1.103,
"step": 1920
},
{
"epoch": 0.1952,
"grad_norm": 1.1799278259277344,
"learning_rate": 9.766796971627543e-05,
"loss": 1.0284,
"step": 1952
},
{
"epoch": 0.1984,
"grad_norm": 1.1231826543807983,
"learning_rate": 9.759150780990507e-05,
"loss": 1.0863,
"step": 1984
},
{
"epoch": 0.2016,
"grad_norm": 0.9852601289749146,
"learning_rate": 9.751384345011787e-05,
"loss": 1.0038,
"step": 2016
},
{
"epoch": 0.2048,
"grad_norm": 1.560398817062378,
"learning_rate": 9.743497859919196e-05,
"loss": 1.0669,
"step": 2048
},
{
"epoch": 0.208,
"grad_norm": 1.0659574270248413,
"learning_rate": 9.735491524973722e-05,
"loss": 1.1653,
"step": 2080
},
{
"epoch": 0.2112,
"grad_norm": 1.3178914785385132,
"learning_rate": 9.727365542464497e-05,
"loss": 1.0349,
"step": 2112
},
{
"epoch": 0.2144,
"grad_norm": 1.0162935256958008,
"learning_rate": 9.719379593129512e-05,
"loss": 1.1365,
"step": 2144
},
{
"epoch": 0.2176,
"grad_norm": 1.0954954624176025,
"learning_rate": 9.711018657323799e-05,
"loss": 1.0986,
"step": 2176
},
{
"epoch": 0.2208,
"grad_norm": 1.1444238424301147,
"learning_rate": 9.702538692289092e-05,
"loss": 1.1172,
"step": 2208
},
{
"epoch": 0.224,
"grad_norm": 1.112743616104126,
"learning_rate": 9.693939912281324e-05,
"loss": 1.0753,
"step": 2240
},
{
"epoch": 0.2272,
"grad_norm": 2.3814074993133545,
"learning_rate": 9.685222534558421e-05,
"loss": 1.0974,
"step": 2272
},
{
"epoch": 0.2304,
"grad_norm": 1.231828212738037,
"learning_rate": 9.676386779374819e-05,
"loss": 1.065,
"step": 2304
},
{
"epoch": 0.2336,
"grad_norm": 1.3037365674972534,
"learning_rate": 9.667432869975897e-05,
"loss": 1.0593,
"step": 2336
},
{
"epoch": 0.2368,
"grad_norm": 0.9929208755493164,
"learning_rate": 9.658361032592323e-05,
"loss": 1.0158,
"step": 2368
},
{
"epoch": 0.24,
"grad_norm": 0.9741840362548828,
"learning_rate": 9.649171496434361e-05,
"loss": 1.1219,
"step": 2400
},
{
"epoch": 0.2432,
"grad_norm": 1.1726874113082886,
"learning_rate": 9.639864493686061e-05,
"loss": 1.1151,
"step": 2432
},
{
"epoch": 0.2464,
"grad_norm": 0.9950255751609802,
"learning_rate": 9.630440259499406e-05,
"loss": 1.0553,
"step": 2464
},
{
"epoch": 0.2496,
"grad_norm": 1.0302757024765015,
"learning_rate": 9.620899031988359e-05,
"loss": 0.9945,
"step": 2496
},
{
"epoch": 0.2528,
"grad_norm": 1.5070362091064453,
"learning_rate": 9.611241052222852e-05,
"loss": 1.0367,
"step": 2528
},
{
"epoch": 0.256,
"grad_norm": 1.3058743476867676,
"learning_rate": 9.601466564222697e-05,
"loss": 1.0692,
"step": 2560
},
{
"epoch": 0.2592,
"grad_norm": 1.237705111503601,
"learning_rate": 9.591575814951419e-05,
"loss": 1.0626,
"step": 2592
},
{
"epoch": 0.2624,
"grad_norm": 2.5512192249298096,
"learning_rate": 9.581569054310016e-05,
"loss": 1.0316,
"step": 2624
},
{
"epoch": 0.2656,
"grad_norm": 1.5230085849761963,
"learning_rate": 9.571446535130641e-05,
"loss": 1.1311,
"step": 2656
},
{
"epoch": 0.2688,
"grad_norm": 0.9144539833068848,
"learning_rate": 9.561208513170223e-05,
"loss": 1.0661,
"step": 2688
},
{
"epoch": 0.272,
"grad_norm": 1.3663253784179688,
"learning_rate": 9.550855247103998e-05,
"loss": 1.0214,
"step": 2720
},
{
"epoch": 0.2752,
"grad_norm": 1.175469994544983,
"learning_rate": 9.540386998518972e-05,
"loss": 1.0807,
"step": 2752
},
{
"epoch": 0.2784,
"grad_norm": 0.9978043437004089,
"learning_rate": 9.529804031907319e-05,
"loss": 0.9998,
"step": 2784
},
{
"epoch": 0.2816,
"grad_norm": 1.9085396528244019,
"learning_rate": 9.519106614659692e-05,
"loss": 1.0589,
"step": 2816
},
{
"epoch": 0.2848,
"grad_norm": 1.1169906854629517,
"learning_rate": 9.50829501705847e-05,
"loss": 1.0892,
"step": 2848
},
{
"epoch": 0.288,
"grad_norm": 1.0185884237289429,
"learning_rate": 9.497369512270926e-05,
"loss": 1.1336,
"step": 2880
},
{
"epoch": 0.2912,
"grad_norm": 1.0060242414474487,
"learning_rate": 9.48633037634233e-05,
"loss": 1.0556,
"step": 2912
},
{
"epoch": 0.2944,
"grad_norm": 1.1675645112991333,
"learning_rate": 9.475177888188969e-05,
"loss": 1.1435,
"step": 2944
},
{
"epoch": 0.2976,
"grad_norm": 1.0665452480316162,
"learning_rate": 9.463912329591105e-05,
"loss": 1.0272,
"step": 2976
},
{
"epoch": 0.3008,
"grad_norm": 0.9090532064437866,
"learning_rate": 9.452533985185852e-05,
"loss": 1.0426,
"step": 3008
},
{
"epoch": 0.304,
"grad_norm": 0.8997248411178589,
"learning_rate": 9.441043142459985e-05,
"loss": 1.0359,
"step": 3040
},
{
"epoch": 0.3072,
"grad_norm": 1.1093074083328247,
"learning_rate": 9.429440091742676e-05,
"loss": 0.9781,
"step": 3072
},
{
"epoch": 0.3104,
"grad_norm": 1.2096401453018188,
"learning_rate": 9.41772512619816e-05,
"loss": 1.0604,
"step": 3104
},
{
"epoch": 0.3136,
"grad_norm": 0.9333838820457458,
"learning_rate": 9.405898541818329e-05,
"loss": 1.0607,
"step": 3136
},
{
"epoch": 0.3168,
"grad_norm": 1.3407750129699707,
"learning_rate": 9.393960637415248e-05,
"loss": 1.0114,
"step": 3168
},
{
"epoch": 0.32,
"grad_norm": 2.159203290939331,
"learning_rate": 9.38191171461361e-05,
"loss": 1.0373,
"step": 3200
},
{
"epoch": 0.3232,
"grad_norm": 1.2726435661315918,
"learning_rate": 9.369752077843114e-05,
"loss": 1.1084,
"step": 3232
},
{
"epoch": 0.3264,
"grad_norm": 1.0973173379898071,
"learning_rate": 9.357482034330775e-05,
"loss": 0.9722,
"step": 3264
},
{
"epoch": 0.3296,
"grad_norm": 1.1249974966049194,
"learning_rate": 9.345101894093154e-05,
"loss": 1.0646,
"step": 3296
},
{
"epoch": 0.3328,
"grad_norm": 1.109535574913025,
"learning_rate": 9.332611969928536e-05,
"loss": 1.0296,
"step": 3328
},
{
"epoch": 0.336,
"grad_norm": 0.9725342392921448,
"learning_rate": 9.32001257740902e-05,
"loss": 1.0389,
"step": 3360
},
{
"epoch": 0.3392,
"grad_norm": 1.2341935634613037,
"learning_rate": 9.307304034872545e-05,
"loss": 1.064,
"step": 3392
},
{
"epoch": 0.3424,
"grad_norm": 0.9506546854972839,
"learning_rate": 9.294486663414851e-05,
"loss": 1.089,
"step": 3424
},
{
"epoch": 0.3456,
"grad_norm": 1.4520303010940552,
"learning_rate": 9.281560786881363e-05,
"loss": 1.0143,
"step": 3456
},
{
"epoch": 0.3488,
"grad_norm": 1.069808006286621,
"learning_rate": 9.268526731859013e-05,
"loss": 1.0328,
"step": 3488
},
{
"epoch": 0.352,
"grad_norm": 0.9005358815193176,
"learning_rate": 9.25538482766798e-05,
"loss": 1.0842,
"step": 3520
},
{
"epoch": 0.3552,
"grad_norm": 0.9558689594268799,
"learning_rate": 9.242135406353378e-05,
"loss": 0.9927,
"step": 3552
},
{
"epoch": 0.3584,
"grad_norm": 1.1002886295318604,
"learning_rate": 9.228778802676863e-05,
"loss": 1.1007,
"step": 3584
},
{
"epoch": 0.3616,
"grad_norm": 1.1814830303192139,
"learning_rate": 9.215315354108174e-05,
"loss": 1.0102,
"step": 3616
},
{
"epoch": 0.3648,
"grad_norm": 1.4370577335357666,
"learning_rate": 9.201745400816606e-05,
"loss": 1.0723,
"step": 3648
},
{
"epoch": 0.368,
"grad_norm": 1.0058218240737915,
"learning_rate": 9.18806928566242e-05,
"loss": 1.1156,
"step": 3680
},
{
"epoch": 0.3712,
"grad_norm": 1.2105575799942017,
"learning_rate": 9.174287354188174e-05,
"loss": 1.0626,
"step": 3712
},
{
"epoch": 0.3744,
"grad_norm": 0.8971224427223206,
"learning_rate": 9.160399954609997e-05,
"loss": 1.1357,
"step": 3744
},
{
"epoch": 0.3776,
"grad_norm": 1.018344521522522,
"learning_rate": 9.146407437808788e-05,
"loss": 1.1171,
"step": 3776
},
{
"epoch": 0.3808,
"grad_norm": 1.098207712173462,
"learning_rate": 9.132310157321354e-05,
"loss": 1.0556,
"step": 3808
},
{
"epoch": 0.384,
"grad_norm": 1.0569736957550049,
"learning_rate": 9.11810846933147e-05,
"loss": 0.9559,
"step": 3840
},
{
"epoch": 0.3872,
"grad_norm": 1.0195281505584717,
"learning_rate": 9.103802732660894e-05,
"loss": 1.0586,
"step": 3872
},
{
"epoch": 0.3904,
"grad_norm": 1.4709314107894897,
"learning_rate": 9.089393308760283e-05,
"loss": 1.0509,
"step": 3904
},
{
"epoch": 0.3936,
"grad_norm": 0.8363422751426697,
"learning_rate": 9.074880561700074e-05,
"loss": 1.0672,
"step": 3936
},
{
"epoch": 0.3968,
"grad_norm": 1.2150477170944214,
"learning_rate": 9.06026485816128e-05,
"loss": 1.0645,
"step": 3968
},
{
"epoch": 0.4,
"grad_norm": 1.0260250568389893,
"learning_rate": 9.045546567426227e-05,
"loss": 1.0307,
"step": 4000
},
{
"epoch": 0.4032,
"grad_norm": 1.3611576557159424,
"learning_rate": 9.03072606136922e-05,
"loss": 1.1087,
"step": 4032
},
{
"epoch": 0.4064,
"grad_norm": 1.0070726871490479,
"learning_rate": 9.015803714447153e-05,
"loss": 1.0799,
"step": 4064
},
{
"epoch": 0.4096,
"grad_norm": 1.0184143781661987,
"learning_rate": 9.000779903690044e-05,
"loss": 1.0447,
"step": 4096
},
{
"epoch": 0.4128,
"grad_norm": 0.8251619935035706,
"learning_rate": 8.985655008691512e-05,
"loss": 1.0781,
"step": 4128
},
{
"epoch": 0.416,
"grad_norm": 1.1904375553131104,
"learning_rate": 8.970429411599177e-05,
"loss": 1.0679,
"step": 4160
},
{
"epoch": 0.4192,
"grad_norm": 1.1670352220535278,
"learning_rate": 8.955103497105021e-05,
"loss": 1.0098,
"step": 4192
},
{
"epoch": 0.4224,
"grad_norm": 1.018236517906189,
"learning_rate": 8.93967765243565e-05,
"loss": 1.0357,
"step": 4224
},
{
"epoch": 0.4256,
"grad_norm": 1.187759518623352,
"learning_rate": 8.924152267342529e-05,
"loss": 1.1212,
"step": 4256
},
{
"epoch": 0.4288,
"grad_norm": 0.9191340208053589,
"learning_rate": 8.908527734092114e-05,
"loss": 0.9963,
"step": 4288
},
{
"epoch": 0.432,
"grad_norm": 1.250663161277771,
"learning_rate": 8.893297291025703e-05,
"loss": 1.1243,
"step": 4320
},
{
"epoch": 0.4352,
"grad_norm": 1.1205859184265137,
"learning_rate": 8.877478715861173e-05,
"loss": 0.9712,
"step": 4352
},
{
"epoch": 0.4384,
"grad_norm": 1.024400234222412,
"learning_rate": 8.86156217179956e-05,
"loss": 1.0184,
"step": 4384
},
{
"epoch": 0.4416,
"grad_norm": 1.0629040002822876,
"learning_rate": 8.845548060990401e-05,
"loss": 1.0391,
"step": 4416
},
{
"epoch": 0.4448,
"grad_norm": 1.0474681854248047,
"learning_rate": 8.829436788048366e-05,
"loss": 1.1721,
"step": 4448
},
{
"epoch": 0.448,
"grad_norm": 1.2960838079452515,
"learning_rate": 8.813228760043037e-05,
"loss": 1.0247,
"step": 4480
},
{
"epoch": 0.4512,
"grad_norm": 1.1051262617111206,
"learning_rate": 8.796924386488624e-05,
"loss": 1.068,
"step": 4512
},
{
"epoch": 0.4544,
"grad_norm": 0.9894328713417053,
"learning_rate": 8.780524079333615e-05,
"loss": 0.9805,
"step": 4544
},
{
"epoch": 0.4576,
"grad_norm": 1.0095499753952026,
"learning_rate": 8.764028252950365e-05,
"loss": 0.9994,
"step": 4576
},
{
"epoch": 0.4608,
"grad_norm": 1.0299321413040161,
"learning_rate": 8.74743732412464e-05,
"loss": 1.0258,
"step": 4608
},
{
"epoch": 0.464,
"grad_norm": 1.11245858669281,
"learning_rate": 8.73075171204507e-05,
"loss": 1.0388,
"step": 4640
},
{
"epoch": 0.4672,
"grad_norm": 1.2084026336669922,
"learning_rate": 8.713971838292569e-05,
"loss": 1.1596,
"step": 4672
},
{
"epoch": 0.4704,
"grad_norm": 1.1535048484802246,
"learning_rate": 8.697098126829675e-05,
"loss": 1.0674,
"step": 4704
},
{
"epoch": 0.4736,
"grad_norm": 1.0976839065551758,
"learning_rate": 8.680131003989842e-05,
"loss": 1.1089,
"step": 4736
},
{
"epoch": 0.4768,
"grad_norm": 1.1108759641647339,
"learning_rate": 8.663070898466674e-05,
"loss": 1.0047,
"step": 4768
},
{
"epoch": 0.48,
"grad_norm": 0.9953986406326294,
"learning_rate": 8.645918241303084e-05,
"loss": 1.0991,
"step": 4800
},
{
"epoch": 0.4832,
"grad_norm": 1.0783305168151855,
"learning_rate": 8.628673465880404e-05,
"loss": 1.0449,
"step": 4832
},
{
"epoch": 0.4864,
"grad_norm": 1.0670068264007568,
"learning_rate": 8.611337007907448e-05,
"loss": 1.0002,
"step": 4864
},
{
"epoch": 0.4896,
"grad_norm": 1.4406965970993042,
"learning_rate": 8.59390930540948e-05,
"loss": 1.0825,
"step": 4896
},
{
"epoch": 0.4928,
"grad_norm": 2.000100612640381,
"learning_rate": 8.576390798717174e-05,
"loss": 1.0658,
"step": 4928
},
{
"epoch": 0.496,
"grad_norm": 1.1239198446273804,
"learning_rate": 8.558781930455464e-05,
"loss": 1.0066,
"step": 4960
},
{
"epoch": 0.4992,
"grad_norm": 0.965144157409668,
"learning_rate": 8.54108314553238e-05,
"loss": 1.0965,
"step": 4992
},
{
"epoch": 0.5024,
"grad_norm": 1.0297799110412598,
"learning_rate": 8.523294891127794e-05,
"loss": 1.0257,
"step": 5024
},
{
"epoch": 0.5056,
"grad_norm": 1.1478264331817627,
"learning_rate": 8.505417616682126e-05,
"loss": 1.0601,
"step": 5056
},
{
"epoch": 0.5088,
"grad_norm": 1.0132007598876953,
"learning_rate": 8.487451773884987e-05,
"loss": 1.0643,
"step": 5088
},
{
"epoch": 0.512,
"grad_norm": 1.5010863542556763,
"learning_rate": 8.469397816663773e-05,
"loss": 1.0577,
"step": 5120
},
{
"epoch": 0.5152,
"grad_norm": 1.0446892976760864,
"learning_rate": 8.451256201172186e-05,
"loss": 1.0305,
"step": 5152
},
{
"epoch": 0.5184,
"grad_norm": 1.0374213457107544,
"learning_rate": 8.433027385778716e-05,
"loss": 1.0254,
"step": 5184
},
{
"epoch": 0.5216,
"grad_norm": 0.958988606929779,
"learning_rate": 8.414711831055056e-05,
"loss": 1.0157,
"step": 5216
},
{
"epoch": 0.5248,
"grad_norm": 1.049494981765747,
"learning_rate": 8.396309999764467e-05,
"loss": 1.0241,
"step": 5248
},
{
"epoch": 0.528,
"grad_norm": 0.9103986620903015,
"learning_rate": 8.377822356850084e-05,
"loss": 1.0658,
"step": 5280
},
{
"epoch": 0.5312,
"grad_norm": 1.6454554796218872,
"learning_rate": 8.359249369423177e-05,
"loss": 1.0543,
"step": 5312
},
{
"epoch": 0.5344,
"grad_norm": 1.1632812023162842,
"learning_rate": 8.34059150675133e-05,
"loss": 1.0576,
"step": 5344
},
{
"epoch": 0.5376,
"grad_norm": 1.066264033317566,
"learning_rate": 8.321849240246608e-05,
"loss": 1.0488,
"step": 5376
},
{
"epoch": 0.5408,
"grad_norm": 0.9884083867073059,
"learning_rate": 8.303023043453624e-05,
"loss": 1.054,
"step": 5408
},
{
"epoch": 0.544,
"grad_norm": 1.1581878662109375,
"learning_rate": 8.284113392037593e-05,
"loss": 1.0847,
"step": 5440
},
{
"epoch": 0.5472,
"grad_norm": 1.0645771026611328,
"learning_rate": 8.265120763772303e-05,
"loss": 0.9862,
"step": 5472
},
{
"epoch": 0.5504,
"grad_norm": 1.2600454092025757,
"learning_rate": 8.246045638528047e-05,
"loss": 1.0295,
"step": 5504
},
{
"epoch": 0.5536,
"grad_norm": 1.2756901979446411,
"learning_rate": 8.226888498259496e-05,
"loss": 0.9753,
"step": 5536
},
{
"epoch": 0.5568,
"grad_norm": 1.0469154119491577,
"learning_rate": 8.207649826993522e-05,
"loss": 1.0993,
"step": 5568
},
{
"epoch": 0.56,
"grad_norm": 1.1633126735687256,
"learning_rate": 8.188330110816976e-05,
"loss": 0.9892,
"step": 5600
},
{
"epoch": 0.5632,
"grad_norm": 1.7112101316452026,
"learning_rate": 8.168929837864395e-05,
"loss": 0.9913,
"step": 5632
},
{
"epoch": 0.5664,
"grad_norm": 1.0041791200637817,
"learning_rate": 8.149449498305674e-05,
"loss": 1.0494,
"step": 5664
},
{
"epoch": 0.5696,
"grad_norm": 1.1538423299789429,
"learning_rate": 8.12988958433369e-05,
"loss": 1.0383,
"step": 5696
},
{
"epoch": 0.5728,
"grad_norm": 0.9828271865844727,
"learning_rate": 8.110250590151848e-05,
"loss": 1.1132,
"step": 5728
},
{
"epoch": 0.576,
"grad_norm": 1.243087649345398,
"learning_rate": 8.090533011961609e-05,
"loss": 1.008,
"step": 5760
},
{
"epoch": 0.5792,
"grad_norm": 1.0514239072799683,
"learning_rate": 8.070737347949947e-05,
"loss": 1.0286,
"step": 5792
},
{
"epoch": 0.5824,
"grad_norm": 1.0970929861068726,
"learning_rate": 8.050864098276762e-05,
"loss": 1.1212,
"step": 5824
},
{
"epoch": 0.5856,
"grad_norm": 1.0040539503097534,
"learning_rate": 8.030913765062245e-05,
"loss": 1.0395,
"step": 5856
},
{
"epoch": 0.5888,
"grad_norm": 0.8210061192512512,
"learning_rate": 8.010886852374191e-05,
"loss": 1.1159,
"step": 5888
},
{
"epoch": 0.592,
"grad_norm": 1.0136836767196655,
"learning_rate": 7.990783866215259e-05,
"loss": 1.0392,
"step": 5920
},
{
"epoch": 0.5952,
"grad_norm": 1.1107640266418457,
"learning_rate": 7.970605314510194e-05,
"loss": 1.0279,
"step": 5952
},
{
"epoch": 0.5984,
"grad_norm": 0.9535327553749084,
"learning_rate": 7.950351707092987e-05,
"loss": 1.0608,
"step": 5984
},
{
"epoch": 0.6016,
"grad_norm": 1.3050202131271362,
"learning_rate": 7.930023555693999e-05,
"loss": 1.0714,
"step": 6016
},
{
"epoch": 0.6048,
"grad_norm": 1.0925366878509521,
"learning_rate": 7.909621373927029e-05,
"loss": 0.9707,
"step": 6048
},
{
"epoch": 0.608,
"grad_norm": 0.9475853443145752,
"learning_rate": 7.88914567727634e-05,
"loss": 1.0056,
"step": 6080
},
{
"epoch": 0.6112,
"grad_norm": 1.2536673545837402,
"learning_rate": 7.868596983083623e-05,
"loss": 1.0983,
"step": 6112
},
{
"epoch": 0.6144,
"grad_norm": 1.1593080759048462,
"learning_rate": 7.847975810534943e-05,
"loss": 1.0214,
"step": 6144
},
{
"epoch": 0.6176,
"grad_norm": 1.4903924465179443,
"learning_rate": 7.82728268064761e-05,
"loss": 1.0825,
"step": 6176
},
{
"epoch": 0.6208,
"grad_norm": 1.31364905834198,
"learning_rate": 7.80651811625702e-05,
"loss": 1.0184,
"step": 6208
},
{
"epoch": 0.624,
"grad_norm": 1.1020359992980957,
"learning_rate": 7.785682642003437e-05,
"loss": 0.9785,
"step": 6240
},
{
"epoch": 0.6272,
"grad_norm": 1.680654525756836,
"learning_rate": 7.764776784318751e-05,
"loss": 1.0493,
"step": 6272
},
{
"epoch": 0.6304,
"grad_norm": 0.8548070192337036,
"learning_rate": 7.743801071413161e-05,
"loss": 1.0325,
"step": 6304
},
{
"epoch": 0.6336,
"grad_norm": 1.3193022012710571,
"learning_rate": 7.722756033261844e-05,
"loss": 1.0861,
"step": 6336
},
{
"epoch": 0.6368,
"grad_norm": 1.1262884140014648,
"learning_rate": 7.701642201591555e-05,
"loss": 0.9799,
"step": 6368
},
{
"epoch": 0.64,
"grad_norm": 1.0190273523330688,
"learning_rate": 7.680460109867194e-05,
"loss": 0.9806,
"step": 6400
},
{
"epoch": 0.6432,
"grad_norm": 0.9623986482620239,
"learning_rate": 7.659210293278334e-05,
"loss": 1.0146,
"step": 6432
},
{
"epoch": 0.6464,
"grad_norm": 0.8106020092964172,
"learning_rate": 7.637893288725688e-05,
"loss": 1.1549,
"step": 6464
},
{
"epoch": 0.6496,
"grad_norm": 1.0692909955978394,
"learning_rate": 7.616509634807549e-05,
"loss": 1.0515,
"step": 6496
},
{
"epoch": 0.6528,
"grad_norm": 0.7676146626472473,
"learning_rate": 7.595059871806187e-05,
"loss": 1.0496,
"step": 6528
},
{
"epoch": 0.656,
"grad_norm": 1.4028490781784058,
"learning_rate": 7.574217882816324e-05,
"loss": 1.1564,
"step": 6560
},
{
"epoch": 0.6592,
"grad_norm": 2.0384438037872314,
"learning_rate": 7.552639552903132e-05,
"loss": 0.9668,
"step": 6592
},
{
"epoch": 0.6624,
"grad_norm": 1.113044261932373,
"learning_rate": 7.53099672765677e-05,
"loss": 1.0345,
"step": 6624
},
{
"epoch": 0.6656,
"grad_norm": 1.3547977209091187,
"learning_rate": 7.509289953907758e-05,
"loss": 1.0719,
"step": 6656
},
{
"epoch": 0.6688,
"grad_norm": 0.9287874102592468,
"learning_rate": 7.487519780102354e-05,
"loss": 1.0301,
"step": 6688
},
{
"epoch": 0.672,
"grad_norm": 1.3750686645507812,
"learning_rate": 7.46568675628869e-05,
"loss": 1.0542,
"step": 6720
},
{
"epoch": 0.6752,
"grad_norm": 0.5963271260261536,
"learning_rate": 7.443791434102868e-05,
"loss": 0.9945,
"step": 6752
},
{
"epoch": 0.6784,
"grad_norm": 1.117193341255188,
"learning_rate": 7.421834366755039e-05,
"loss": 1.0214,
"step": 6784
},
{
"epoch": 0.6816,
"grad_norm": 1.096929907798767,
"learning_rate": 7.399816109015407e-05,
"loss": 1.0439,
"step": 6816
},
{
"epoch": 0.6848,
"grad_norm": 1.0610090494155884,
"learning_rate": 7.377737217200226e-05,
"loss": 1.041,
"step": 6848
},
{
"epoch": 0.688,
"grad_norm": 0.9771848320960999,
"learning_rate": 7.355598249157734e-05,
"loss": 1.1224,
"step": 6880
},
{
"epoch": 0.6912,
"grad_norm": 1.0380698442459106,
"learning_rate": 7.333399764254068e-05,
"loss": 1.0475,
"step": 6912
},
{
"epoch": 0.6944,
"grad_norm": 1.1135938167572021,
"learning_rate": 7.311142323359121e-05,
"loss": 0.9665,
"step": 6944
},
{
"epoch": 0.6976,
"grad_norm": 0.9427506327629089,
"learning_rate": 7.288826488832384e-05,
"loss": 1.0845,
"step": 6976
},
{
"epoch": 0.7008,
"grad_norm": 1.020609736442566,
"learning_rate": 7.266452824508719e-05,
"loss": 1.0806,
"step": 7008
},
{
"epoch": 0.704,
"grad_norm": 1.3327020406723022,
"learning_rate": 7.244021895684131e-05,
"loss": 1.0456,
"step": 7040
},
{
"epoch": 0.7072,
"grad_norm": 0.9490824937820435,
"learning_rate": 7.221534269101474e-05,
"loss": 1.0546,
"step": 7072
},
{
"epoch": 0.7104,
"grad_norm": 1.043341875076294,
"learning_rate": 7.198990512936135e-05,
"loss": 0.9643,
"step": 7104
},
{
"epoch": 0.7136,
"grad_norm": 1.0628856420516968,
"learning_rate": 7.17639119678168e-05,
"loss": 1.0433,
"step": 7136
},
{
"epoch": 0.7168,
"grad_norm": 0.8244098424911499,
"learning_rate": 7.153736891635463e-05,
"loss": 1.0359,
"step": 7168
},
{
"epoch": 0.72,
"grad_norm": 1.1554003953933716,
"learning_rate": 7.131028169884194e-05,
"loss": 1.0216,
"step": 7200
},
{
"epoch": 0.7232,
"grad_norm": 1.1582995653152466,
"learning_rate": 7.108265605289481e-05,
"loss": 0.9845,
"step": 7232
},
{
"epoch": 0.7264,
"grad_norm": 1.1655360460281372,
"learning_rate": 7.085449772973333e-05,
"loss": 1.0771,
"step": 7264
},
{
"epoch": 0.7296,
"grad_norm": 1.28196382522583,
"learning_rate": 7.062581249403627e-05,
"loss": 1.0186,
"step": 7296
},
{
"epoch": 0.7328,
"grad_norm": 1.149167537689209,
"learning_rate": 7.039660612379546e-05,
"loss": 0.9905,
"step": 7328
},
{
"epoch": 0.736,
"grad_norm": 1.0396078824996948,
"learning_rate": 7.016688441016979e-05,
"loss": 1.0196,
"step": 7360
},
{
"epoch": 0.7392,
"grad_norm": 0.8532673716545105,
"learning_rate": 6.993665315733889e-05,
"loss": 1.0197,
"step": 7392
},
{
"epoch": 0.7424,
"grad_norm": 1.03330659866333,
"learning_rate": 6.970591818235641e-05,
"loss": 1.0163,
"step": 7424
},
{
"epoch": 0.7456,
"grad_norm": 1.5266470909118652,
"learning_rate": 6.947468531500321e-05,
"loss": 1.0247,
"step": 7456
},
{
"epoch": 0.7488,
"grad_norm": 1.127951979637146,
"learning_rate": 6.924296039763987e-05,
"loss": 0.9851,
"step": 7488
},
{
"epoch": 0.752,
"grad_norm": 1.0132697820663452,
"learning_rate": 6.901074928505928e-05,
"loss": 1.0015,
"step": 7520
},
{
"epoch": 0.7552,
"grad_norm": 1.034342646598816,
"learning_rate": 6.877805784433852e-05,
"loss": 0.978,
"step": 7552
},
{
"epoch": 0.7584,
"grad_norm": 0.9696159958839417,
"learning_rate": 6.854489195469069e-05,
"loss": 1.129,
"step": 7584
},
{
"epoch": 0.7616,
"grad_norm": 1.056174874305725,
"learning_rate": 6.831125750731646e-05,
"loss": 1.0418,
"step": 7616
},
{
"epoch": 0.7648,
"grad_norm": 0.9070044755935669,
"learning_rate": 6.80771604052551e-05,
"loss": 1.0073,
"step": 7648
},
{
"epoch": 0.768,
"grad_norm": 1.1860136985778809,
"learning_rate": 6.784260656323533e-05,
"loss": 1.0599,
"step": 7680
},
{
"epoch": 0.7712,
"grad_norm": 1.0756847858428955,
"learning_rate": 6.760760190752604e-05,
"loss": 1.0392,
"step": 7712
},
{
"epoch": 0.7744,
"grad_norm": 1.247762680053711,
"learning_rate": 6.737215237578631e-05,
"loss": 1.0265,
"step": 7744
},
{
"epoch": 0.7776,
"grad_norm": 1.0265753269195557,
"learning_rate": 6.71362639169156e-05,
"loss": 1.031,
"step": 7776
},
{
"epoch": 0.7808,
"grad_norm": 1.0263795852661133,
"learning_rate": 6.689994249090333e-05,
"loss": 0.9527,
"step": 7808
},
{
"epoch": 0.784,
"grad_norm": 1.2893991470336914,
"learning_rate": 6.666319406867833e-05,
"loss": 1.1626,
"step": 7840
},
{
"epoch": 0.7872,
"grad_norm": 0.958138644695282,
"learning_rate": 6.642602463195799e-05,
"loss": 1.1133,
"step": 7872
},
{
"epoch": 0.7904,
"grad_norm": 1.257802128791809,
"learning_rate": 6.618844017309708e-05,
"loss": 1.0102,
"step": 7904
},
{
"epoch": 0.7936,
"grad_norm": 1.1419870853424072,
"learning_rate": 6.59504466949364e-05,
"loss": 1.0997,
"step": 7936
},
{
"epoch": 0.7968,
"grad_norm": 0.9523638486862183,
"learning_rate": 6.571205021065108e-05,
"loss": 1.0273,
"step": 7968
},
{
"epoch": 0.8,
"grad_norm": 1.1219632625579834,
"learning_rate": 6.547325674359865e-05,
"loss": 1.123,
"step": 8000
},
{
"epoch": 0.8032,
"grad_norm": 1.3210878372192383,
"learning_rate": 6.523407232716684e-05,
"loss": 0.9976,
"step": 8032
},
{
"epoch": 0.8064,
"grad_norm": 1.176743984222412,
"learning_rate": 6.499450300462121e-05,
"loss": 1.0448,
"step": 8064
},
{
"epoch": 0.8096,
"grad_norm": 1.2411494255065918,
"learning_rate": 6.475455482895238e-05,
"loss": 1.0001,
"step": 8096
},
{
"epoch": 0.8128,
"grad_norm": 1.0539944171905518,
"learning_rate": 6.451423386272312e-05,
"loss": 1.122,
"step": 8128
},
{
"epoch": 0.816,
"grad_norm": 2.260613203048706,
"learning_rate": 6.427354617791519e-05,
"loss": 1.005,
"step": 8160
},
{
"epoch": 0.8192,
"grad_norm": 1.2137510776519775,
"learning_rate": 6.403249785577589e-05,
"loss": 0.9567,
"step": 8192
},
{
"epoch": 0.8224,
"grad_norm": 1.1636831760406494,
"learning_rate": 6.379109498666445e-05,
"loss": 1.0428,
"step": 8224
},
{
"epoch": 0.8256,
"grad_norm": 1.1331391334533691,
"learning_rate": 6.354934366989812e-05,
"loss": 1.0609,
"step": 8256
},
{
"epoch": 0.8288,
"grad_norm": 1.5374737977981567,
"learning_rate": 6.330725001359809e-05,
"loss": 1.0728,
"step": 8288
},
{
"epoch": 0.832,
"grad_norm": 1.287787675857544,
"learning_rate": 6.306482013453515e-05,
"loss": 1.0416,
"step": 8320
},
{
"epoch": 0.8352,
"grad_norm": 1.130149006843567,
"learning_rate": 6.28220601579751e-05,
"loss": 1.0513,
"step": 8352
},
{
"epoch": 0.8384,
"grad_norm": 0.9294027090072632,
"learning_rate": 6.257897621752405e-05,
"loss": 1.0551,
"step": 8384
},
{
"epoch": 0.8416,
"grad_norm": 0.8640485405921936,
"learning_rate": 6.233557445497345e-05,
"loss": 1.0518,
"step": 8416
},
{
"epoch": 0.8448,
"grad_norm": 1.1084208488464355,
"learning_rate": 6.209186102014486e-05,
"loss": 1.0359,
"step": 8448
},
{
"epoch": 0.848,
"grad_norm": 0.8203976154327393,
"learning_rate": 6.18478420707346e-05,
"loss": 0.9709,
"step": 8480
},
{
"epoch": 0.8512,
"grad_norm": 1.507534384727478,
"learning_rate": 6.160352377215816e-05,
"loss": 0.9479,
"step": 8512
},
{
"epoch": 0.8544,
"grad_norm": 0.8405012488365173,
"learning_rate": 6.135891229739444e-05,
"loss": 1.025,
"step": 8544
},
{
"epoch": 0.8576,
"grad_norm": 0.9983368515968323,
"learning_rate": 6.111401382682972e-05,
"loss": 1.1023,
"step": 8576
},
{
"epoch": 0.8608,
"grad_norm": 1.079447865486145,
"learning_rate": 6.086883454810162e-05,
"loss": 0.9684,
"step": 8608
},
{
"epoch": 0.864,
"grad_norm": 0.963784396648407,
"learning_rate": 6.06310551852323e-05,
"loss": 1.0703,
"step": 8640
},
{
"epoch": 0.8672,
"grad_norm": 1.44817316532135,
"learning_rate": 6.0385341175240205e-05,
"loss": 1.0276,
"step": 8672
},
{
"epoch": 0.8704,
"grad_norm": 1.0072044134140015,
"learning_rate": 6.0139364767825626e-05,
"loss": 1.0744,
"step": 8704
},
{
"epoch": 0.8736,
"grad_norm": 1.328588604927063,
"learning_rate": 5.9893132177861454e-05,
"loss": 1.0823,
"step": 8736
},
{
"epoch": 0.8768,
"grad_norm": 1.323585867881775,
"learning_rate": 5.964664962669333e-05,
"loss": 1.0011,
"step": 8768
},
{
"epoch": 0.88,
"grad_norm": 1.4633543491363525,
"learning_rate": 5.939992334198242e-05,
"loss": 0.9919,
"step": 8800
},
{
"epoch": 0.8832,
"grad_norm": 1.0282506942749023,
"learning_rate": 5.9152959557548117e-05,
"loss": 1.0215,
"step": 8832
},
{
"epoch": 0.8864,
"grad_norm": 0.8649700284004211,
"learning_rate": 5.89057645132105e-05,
"loss": 1.0628,
"step": 8864
},
{
"epoch": 0.8896,
"grad_norm": 0.9102625846862793,
"learning_rate": 5.865834445463273e-05,
"loss": 0.9597,
"step": 8896
},
{
"epoch": 0.8928,
"grad_norm": 1.0294193029403687,
"learning_rate": 5.841070563316315e-05,
"loss": 1.0335,
"step": 8928
},
{
"epoch": 0.896,
"grad_norm": 1.122887372970581,
"learning_rate": 5.8162854305677425e-05,
"loss": 1.0743,
"step": 8960
},
{
"epoch": 0.8992,
"grad_norm": 1.419608235359192,
"learning_rate": 5.791479673442044e-05,
"loss": 1.0136,
"step": 8992
},
{
"epoch": 0.9024,
"grad_norm": 1.0360965728759766,
"learning_rate": 5.7666539186848036e-05,
"loss": 1.0314,
"step": 9024
},
{
"epoch": 0.9056,
"grad_norm": 1.2409007549285889,
"learning_rate": 5.74180879354687e-05,
"loss": 0.903,
"step": 9056
},
{
"epoch": 0.9088,
"grad_norm": 1.0799171924591064,
"learning_rate": 5.716944925768505e-05,
"loss": 1.0727,
"step": 9088
},
{
"epoch": 0.912,
"grad_norm": 1.0068849325180054,
"learning_rate": 5.6920629435635256e-05,
"loss": 0.9064,
"step": 9120
},
{
"epoch": 0.9152,
"grad_norm": 0.9761477708816528,
"learning_rate": 5.6671634756034295e-05,
"loss": 0.9928,
"step": 9152
},
{
"epoch": 0.9184,
"grad_norm": 1.3264461755752563,
"learning_rate": 5.642247151001515e-05,
"loss": 1.0678,
"step": 9184
},
{
"epoch": 0.9216,
"grad_norm": 1.6155526638031006,
"learning_rate": 5.617314599296977e-05,
"loss": 1.0057,
"step": 9216
},
{
"epoch": 0.9248,
"grad_norm": 0.985884428024292,
"learning_rate": 5.592366450439012e-05,
"loss": 1.0783,
"step": 9248
},
{
"epoch": 0.928,
"grad_norm": 1.1194316148757935,
"learning_rate": 5.567403334770891e-05,
"loss": 1.086,
"step": 9280
},
{
"epoch": 0.9312,
"grad_norm": 0.9581426978111267,
"learning_rate": 5.542425883014043e-05,
"loss": 1.0819,
"step": 9312
},
{
"epoch": 0.9344,
"grad_norm": 1.0260018110275269,
"learning_rate": 5.517434726252113e-05,
"loss": 1.0206,
"step": 9344
},
{
"epoch": 0.9376,
"grad_norm": 1.3467062711715698,
"learning_rate": 5.4924304959150175e-05,
"loss": 1.0682,
"step": 9376
},
{
"epoch": 0.9408,
"grad_norm": 1.030444622039795,
"learning_rate": 5.467413823762993e-05,
"loss": 1.0894,
"step": 9408
},
{
"epoch": 0.944,
"grad_norm": 1.1066439151763916,
"learning_rate": 5.4423853418706327e-05,
"loss": 0.938,
"step": 9440
},
{
"epoch": 0.9472,
"grad_norm": 1.088860034942627,
"learning_rate": 5.417345682610914e-05,
"loss": 1.0293,
"step": 9472
},
{
"epoch": 0.9504,
"grad_norm": 1.4524608850479126,
"learning_rate": 5.392295478639225e-05,
"loss": 1.0259,
"step": 9504
},
{
"epoch": 0.9536,
"grad_norm": 1.0502616167068481,
"learning_rate": 5.367235362877378e-05,
"loss": 0.9685,
"step": 9536
},
{
"epoch": 0.9568,
"grad_norm": 1.1287665367126465,
"learning_rate": 5.3421659684976197e-05,
"loss": 1.0295,
"step": 9568
},
{
"epoch": 0.96,
"grad_norm": 1.4596409797668457,
"learning_rate": 5.317087928906627e-05,
"loss": 1.0235,
"step": 9600
},
{
"epoch": 0.9632,
"grad_norm": 1.3627421855926514,
"learning_rate": 5.29200187772951e-05,
"loss": 1.126,
"step": 9632
},
{
"epoch": 0.9664,
"grad_norm": 1.2144567966461182,
"learning_rate": 5.266908448793803e-05,
"loss": 0.9882,
"step": 9664
},
{
"epoch": 0.9696,
"grad_norm": 1.453833818435669,
"learning_rate": 5.2418082761134445e-05,
"loss": 1.0644,
"step": 9696
},
{
"epoch": 0.9728,
"grad_norm": 1.1099966764450073,
"learning_rate": 5.216701993872762e-05,
"loss": 0.974,
"step": 9728
},
{
"epoch": 0.976,
"grad_norm": 0.8567425012588501,
"learning_rate": 5.1915902364104506e-05,
"loss": 1.0689,
"step": 9760
},
{
"epoch": 0.9792,
"grad_norm": 1.1577990055084229,
"learning_rate": 5.166473638203539e-05,
"loss": 1.0094,
"step": 9792
},
{
"epoch": 0.9824,
"grad_norm": 0.8881478905677795,
"learning_rate": 5.141352833851367e-05,
"loss": 1.0945,
"step": 9824
},
{
"epoch": 0.9856,
"grad_norm": 0.8964444994926453,
"learning_rate": 5.116228458059543e-05,
"loss": 1.0251,
"step": 9856
},
{
"epoch": 0.9888,
"grad_norm": 1.2837964296340942,
"learning_rate": 5.0911011456239157e-05,
"loss": 1.1041,
"step": 9888
},
{
"epoch": 0.992,
"grad_norm": 1.0828759670257568,
"learning_rate": 5.065971531414528e-05,
"loss": 1.0765,
"step": 9920
},
{
"epoch": 0.9952,
"grad_norm": 1.0157177448272705,
"learning_rate": 5.0408402503595845e-05,
"loss": 1.0109,
"step": 9952
},
{
"epoch": 0.9984,
"grad_norm": 1.128143310546875,
"learning_rate": 5.0157079374293983e-05,
"loss": 1.0521,
"step": 9984
}
],
"logging_steps": 32,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.411696496839885e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}