{
"best_metric": 0.9274308681488037,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 3.0,
"eval_steps": 50,
"global_step": 141,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02127659574468085,
"grad_norm": 13.848388671875,
"learning_rate": 1.02e-05,
"loss": 4.4616,
"step": 1
},
{
"epoch": 0.02127659574468085,
"eval_loss": 3.8836147785186768,
"eval_runtime": 3.5795,
"eval_samples_per_second": 176.841,
"eval_steps_per_second": 5.587,
"step": 1
},
{
"epoch": 0.0425531914893617,
"grad_norm": 6.909480094909668,
"learning_rate": 2.04e-05,
"loss": 3.8433,
"step": 2
},
{
"epoch": 0.06382978723404255,
"grad_norm": 5.8276777267456055,
"learning_rate": 3.06e-05,
"loss": 3.5629,
"step": 3
},
{
"epoch": 0.0851063829787234,
"grad_norm": 5.752169132232666,
"learning_rate": 4.08e-05,
"loss": 3.5849,
"step": 4
},
{
"epoch": 0.10638297872340426,
"grad_norm": 6.22681999206543,
"learning_rate": 5.1e-05,
"loss": 3.1647,
"step": 5
},
{
"epoch": 0.1276595744680851,
"grad_norm": 9.792737007141113,
"learning_rate": 6.12e-05,
"loss": 2.7336,
"step": 6
},
{
"epoch": 0.14893617021276595,
"grad_norm": 8.102272033691406,
"learning_rate": 7.14e-05,
"loss": 2.2142,
"step": 7
},
{
"epoch": 0.1702127659574468,
"grad_norm": 5.091486930847168,
"learning_rate": 8.16e-05,
"loss": 1.9189,
"step": 8
},
{
"epoch": 0.19148936170212766,
"grad_norm": 2.87353515625,
"learning_rate": 9.18e-05,
"loss": 1.9533,
"step": 9
},
{
"epoch": 0.2127659574468085,
"grad_norm": 2.3648200035095215,
"learning_rate": 0.000102,
"loss": 1.5369,
"step": 10
},
{
"epoch": 0.23404255319148937,
"grad_norm": 1.6724998950958252,
"learning_rate": 0.00010198533518731099,
"loss": 1.4629,
"step": 11
},
{
"epoch": 0.2553191489361702,
"grad_norm": 1.2147119045257568,
"learning_rate": 0.0001019413491828413,
"loss": 1.1376,
"step": 12
},
{
"epoch": 0.2765957446808511,
"grad_norm": 7.048776626586914,
"learning_rate": 0.00010186806728253272,
"loss": 0.8492,
"step": 13
},
{
"epoch": 0.2978723404255319,
"grad_norm": 5.654258728027344,
"learning_rate": 0.00010176553163012415,
"loss": 0.5646,
"step": 14
},
{
"epoch": 0.3191489361702128,
"grad_norm": 4.756352424621582,
"learning_rate": 0.00010163380119291505,
"loss": 1.6847,
"step": 15
},
{
"epoch": 0.3404255319148936,
"grad_norm": 1.8449368476867676,
"learning_rate": 0.00010147295172785395,
"loss": 1.4681,
"step": 16
},
{
"epoch": 0.3617021276595745,
"grad_norm": 0.6776601672172546,
"learning_rate": 0.00010128307573797129,
"loss": 1.244,
"step": 17
},
{
"epoch": 0.3829787234042553,
"grad_norm": 0.5467591285705566,
"learning_rate": 0.00010106428241918177,
"loss": 1.1554,
"step": 18
},
{
"epoch": 0.40425531914893614,
"grad_norm": 0.8386530876159668,
"learning_rate": 0.00010081669759748692,
"loss": 0.7783,
"step": 19
},
{
"epoch": 0.425531914893617,
"grad_norm": 0.9994120597839355,
"learning_rate": 0.00010054046365661356,
"loss": 0.1474,
"step": 20
},
{
"epoch": 0.44680851063829785,
"grad_norm": 0.9148034453392029,
"learning_rate": 0.00010023573945613038,
"loss": 1.3979,
"step": 21
},
{
"epoch": 0.46808510638297873,
"grad_norm": 0.6885150074958801,
"learning_rate": 9.99027002400892e-05,
"loss": 1.4452,
"step": 22
},
{
"epoch": 0.48936170212765956,
"grad_norm": 0.5044928193092346,
"learning_rate": 9.954153753624383e-05,
"loss": 1.2974,
"step": 23
},
{
"epoch": 0.5106382978723404,
"grad_norm": 0.517201840877533,
"learning_rate": 9.915245904590414e-05,
"loss": 1.2173,
"step": 24
},
{
"epoch": 0.5319148936170213,
"grad_norm": 0.7127615809440613,
"learning_rate": 9.873568852448903e-05,
"loss": 1.0097,
"step": 25
},
{
"epoch": 0.5531914893617021,
"grad_norm": 1.4452153444290161,
"learning_rate": 9.829146565284679e-05,
"loss": 0.0585,
"step": 26
},
{
"epoch": 0.574468085106383,
"grad_norm": 0.6424712538719177,
"learning_rate": 9.782004589941682e-05,
"loss": 1.1097,
"step": 27
},
{
"epoch": 0.5957446808510638,
"grad_norm": 0.4880043864250183,
"learning_rate": 9.732170037331209e-05,
"loss": 1.4546,
"step": 28
},
{
"epoch": 0.6170212765957447,
"grad_norm": 0.4043918251991272,
"learning_rate": 9.679671566840698e-05,
"loss": 1.2941,
"step": 29
},
{
"epoch": 0.6382978723404256,
"grad_norm": 0.4223072826862335,
"learning_rate": 9.624539369851954e-05,
"loss": 1.14,
"step": 30
},
{
"epoch": 0.6595744680851063,
"grad_norm": 0.5029892325401306,
"learning_rate": 9.566805152378394e-05,
"loss": 0.9826,
"step": 31
},
{
"epoch": 0.6808510638297872,
"grad_norm": 0.5004958510398865,
"learning_rate": 9.50650211683119e-05,
"loss": 0.3189,
"step": 32
},
{
"epoch": 0.7021276595744681,
"grad_norm": 0.46429964900016785,
"learning_rate": 9.443664942924885e-05,
"loss": 0.9003,
"step": 33
},
{
"epoch": 0.723404255319149,
"grad_norm": 0.5286682844161987,
"learning_rate": 9.378329767733415e-05,
"loss": 1.4447,
"step": 34
},
{
"epoch": 0.7446808510638298,
"grad_norm": 0.4269276559352875,
"learning_rate": 9.310534164908e-05,
"loss": 1.3137,
"step": 35
},
{
"epoch": 0.7659574468085106,
"grad_norm": 0.370991587638855,
"learning_rate": 9.240317123068899e-05,
"loss": 1.1297,
"step": 36
},
{
"epoch": 0.7872340425531915,
"grad_norm": 0.4747011065483093,
"learning_rate": 9.167719023383408e-05,
"loss": 1.0179,
"step": 37
},
{
"epoch": 0.8085106382978723,
"grad_norm": 0.4905516803264618,
"learning_rate": 9.09278161634304e-05,
"loss": 0.4583,
"step": 38
},
{
"epoch": 0.8297872340425532,
"grad_norm": 0.43988707661628723,
"learning_rate": 9.015547997753193e-05,
"loss": 0.6616,
"step": 39
},
{
"epoch": 0.851063829787234,
"grad_norm": 0.45796748995780945,
"learning_rate": 8.936062583949154e-05,
"loss": 1.4275,
"step": 40
},
{
"epoch": 0.8723404255319149,
"grad_norm": 0.36554864048957825,
"learning_rate": 8.854371086252688e-05,
"loss": 1.2779,
"step": 41
},
{
"epoch": 0.8936170212765957,
"grad_norm": 0.3677642345428467,
"learning_rate": 8.770520484683873e-05,
"loss": 1.1917,
"step": 42
},
{
"epoch": 0.9148936170212766,
"grad_norm": 0.35996830463409424,
"learning_rate": 8.68455900094333e-05,
"loss": 1.0847,
"step": 43
},
{
"epoch": 0.9361702127659575,
"grad_norm": 0.4477192163467407,
"learning_rate": 8.596536070680378e-05,
"loss": 0.7112,
"step": 44
},
{
"epoch": 0.9574468085106383,
"grad_norm": 0.4123137891292572,
"learning_rate": 8.506502315063037e-05,
"loss": 0.9964,
"step": 45
},
{
"epoch": 0.9787234042553191,
"grad_norm": 0.358024001121521,
"learning_rate": 8.414509511666283e-05,
"loss": 1.2649,
"step": 46
},
{
"epoch": 1.0,
"grad_norm": 0.37631550431251526,
"learning_rate": 8.320610564695234e-05,
"loss": 0.9995,
"step": 47
},
{
"epoch": 1.0212765957446808,
"grad_norm": 0.22452251613140106,
"learning_rate": 8.224859474560443e-05,
"loss": 0.1898,
"step": 48
},
{
"epoch": 1.0425531914893618,
"grad_norm": 0.4229590594768524,
"learning_rate": 8.127311306822753e-05,
"loss": 1.1368,
"step": 49
},
{
"epoch": 1.0638297872340425,
"grad_norm": 0.37615370750427246,
"learning_rate": 8.028022160525618e-05,
"loss": 1.3321,
"step": 50
},
{
"epoch": 1.0638297872340425,
"eval_loss": 0.9616568088531494,
"eval_runtime": 3.2062,
"eval_samples_per_second": 197.427,
"eval_steps_per_second": 6.238,
"step": 50
},
{
"epoch": 1.0851063829787233,
"grad_norm": 0.32277727127075195,
"learning_rate": 7.927049135933059e-05,
"loss": 1.1556,
"step": 51
},
{
"epoch": 1.1063829787234043,
"grad_norm": 0.4011160135269165,
"learning_rate": 7.82445030169183e-05,
"loss": 1.0646,
"step": 52
},
{
"epoch": 1.127659574468085,
"grad_norm": 0.5565645098686218,
"learning_rate": 7.720284661436687e-05,
"loss": 0.8884,
"step": 53
},
{
"epoch": 1.148936170212766,
"grad_norm": 0.30222636461257935,
"learning_rate": 7.614612119857942e-05,
"loss": 0.2506,
"step": 54
},
{
"epoch": 1.1702127659574468,
"grad_norm": 0.3559470772743225,
"learning_rate": 7.507493448250836e-05,
"loss": 0.722,
"step": 55
},
{
"epoch": 1.1914893617021276,
"grad_norm": 0.4607730805873871,
"learning_rate": 7.398990249566532e-05,
"loss": 1.3531,
"step": 56
},
{
"epoch": 1.2127659574468086,
"grad_norm": 0.39202919602394104,
"learning_rate": 7.289164922984824e-05,
"loss": 1.2476,
"step": 57
},
{
"epoch": 1.2340425531914894,
"grad_norm": 0.35366523265838623,
"learning_rate": 7.178080628028965e-05,
"loss": 1.1099,
"step": 58
},
{
"epoch": 1.2553191489361701,
"grad_norm": 0.40545791387557983,
"learning_rate": 7.065801248243196e-05,
"loss": 0.9506,
"step": 59
},
{
"epoch": 1.2765957446808511,
"grad_norm": 0.3662566542625427,
"learning_rate": 6.952391354453924e-05,
"loss": 0.4258,
"step": 60
},
{
"epoch": 1.297872340425532,
"grad_norm": 0.28656280040740967,
"learning_rate": 6.837916167635644e-05,
"loss": 0.4806,
"step": 61
},
{
"epoch": 1.3191489361702127,
"grad_norm": 0.47951042652130127,
"learning_rate": 6.722441521402946e-05,
"loss": 1.3409,
"step": 62
},
{
"epoch": 1.3404255319148937,
"grad_norm": 0.4036313593387604,
"learning_rate": 6.606033824150241e-05,
"loss": 1.272,
"step": 63
},
{
"epoch": 1.3617021276595744,
"grad_norm": 0.372051477432251,
"learning_rate": 6.48876002086089e-05,
"loss": 1.0842,
"step": 64
},
{
"epoch": 1.3829787234042552,
"grad_norm": 0.4357682466506958,
"learning_rate": 6.37068755460778e-05,
"loss": 1.0105,
"step": 65
},
{
"epoch": 1.4042553191489362,
"grad_norm": 0.5092247128486633,
"learning_rate": 6.251884327767429e-05,
"loss": 0.6371,
"step": 66
},
{
"epoch": 1.425531914893617,
"grad_norm": 0.202036052942276,
"learning_rate": 6.132418662969977e-05,
"loss": 0.2384,
"step": 67
},
{
"epoch": 1.4468085106382977,
"grad_norm": 0.4590073227882385,
"learning_rate": 6.012359263807463e-05,
"loss": 1.2431,
"step": 68
},
{
"epoch": 1.4680851063829787,
"grad_norm": 0.395398885011673,
"learning_rate": 5.891775175323035e-05,
"loss": 1.2642,
"step": 69
},
{
"epoch": 1.4893617021276595,
"grad_norm": 0.4025956690311432,
"learning_rate": 5.770735744303787e-05,
"loss": 1.1548,
"step": 70
},
{
"epoch": 1.5106382978723403,
"grad_norm": 0.402270644903183,
"learning_rate": 5.6493105794000665e-05,
"loss": 1.0174,
"step": 71
},
{
"epoch": 1.5319148936170213,
"grad_norm": 0.5136646032333374,
"learning_rate": 5.52756951109419e-05,
"loss": 0.8706,
"step": 72
},
{
"epoch": 1.5531914893617023,
"grad_norm": 0.0806485190987587,
"learning_rate": 5.405582551541579e-05,
"loss": 0.0626,
"step": 73
},
{
"epoch": 1.574468085106383,
"grad_norm": 0.42770785093307495,
"learning_rate": 5.283419854307425e-05,
"loss": 1.0383,
"step": 74
},
{
"epoch": 1.5957446808510638,
"grad_norm": 0.4656476080417633,
"learning_rate": 5.16115167402202e-05,
"loss": 1.293,
"step": 75
},
{
"epoch": 1.6170212765957448,
"grad_norm": 0.38593193888664246,
"learning_rate": 5.0388483259779815e-05,
"loss": 1.2081,
"step": 76
},
{
"epoch": 1.6382978723404256,
"grad_norm": 0.3994680643081665,
"learning_rate": 4.916580145692577e-05,
"loss": 1.0481,
"step": 77
},
{
"epoch": 1.6595744680851063,
"grad_norm": 0.506732702255249,
"learning_rate": 4.794417448458422e-05,
"loss": 0.8817,
"step": 78
},
{
"epoch": 1.6808510638297873,
"grad_norm": 0.272098571062088,
"learning_rate": 4.67243048890581e-05,
"loss": 0.2088,
"step": 79
},
{
"epoch": 1.702127659574468,
"grad_norm": 0.35723525285720825,
"learning_rate": 4.5506894205999334e-05,
"loss": 0.6819,
"step": 80
},
{
"epoch": 1.7234042553191489,
"grad_norm": 0.4605822265148163,
"learning_rate": 4.429264255696214e-05,
"loss": 1.3524,
"step": 81
},
{
"epoch": 1.7446808510638299,
"grad_norm": 0.38222458958625793,
"learning_rate": 4.308224824676965e-05,
"loss": 1.1625,
"step": 82
},
{
"epoch": 1.7659574468085106,
"grad_norm": 0.3701620399951935,
"learning_rate": 4.187640736192537e-05,
"loss": 1.064,
"step": 83
},
{
"epoch": 1.7872340425531914,
"grad_norm": 0.4499792456626892,
"learning_rate": 4.067581337030022e-05,
"loss": 0.9158,
"step": 84
},
{
"epoch": 1.8085106382978724,
"grad_norm": 0.3936365842819214,
"learning_rate": 3.948115672232572e-05,
"loss": 0.4121,
"step": 85
},
{
"epoch": 1.8297872340425532,
"grad_norm": 0.27606216073036194,
"learning_rate": 3.8293124453922226e-05,
"loss": 0.4227,
"step": 86
},
{
"epoch": 1.851063829787234,
"grad_norm": 0.48591378331184387,
"learning_rate": 3.711239979139111e-05,
"loss": 1.3174,
"step": 87
},
{
"epoch": 1.872340425531915,
"grad_norm": 0.3974682092666626,
"learning_rate": 3.593966175849759e-05,
"loss": 1.2122,
"step": 88
},
{
"epoch": 1.8936170212765957,
"grad_norm": 0.3816875219345093,
"learning_rate": 3.477558478597054e-05,
"loss": 1.1615,
"step": 89
},
{
"epoch": 1.9148936170212765,
"grad_norm": 0.4059462547302246,
"learning_rate": 3.362083832364357e-05,
"loss": 0.9824,
"step": 90
},
{
"epoch": 1.9361702127659575,
"grad_norm": 0.4912261366844177,
"learning_rate": 3.247608645546074e-05,
"loss": 0.683,
"step": 91
},
{
"epoch": 1.9574468085106385,
"grad_norm": 0.4084428548812866,
"learning_rate": 3.134198751756804e-05,
"loss": 1.0533,
"step": 92
},
{
"epoch": 1.978723404255319,
"grad_norm": 0.38703203201293945,
"learning_rate": 3.0219193719710368e-05,
"loss": 1.1633,
"step": 93
},
{
"epoch": 2.0,
"grad_norm": 0.4213656783103943,
"learning_rate": 2.910835077015177e-05,
"loss": 0.9266,
"step": 94
},
{
"epoch": 2.021276595744681,
"grad_norm": 0.08342910557985306,
"learning_rate": 2.8010097504334692e-05,
"loss": 0.0614,
"step": 95
},
{
"epoch": 2.0425531914893615,
"grad_norm": 0.46658873558044434,
"learning_rate": 2.692506551749165e-05,
"loss": 1.0568,
"step": 96
},
{
"epoch": 2.0638297872340425,
"grad_norm": 0.4580381512641907,
"learning_rate": 2.5853878801420582e-05,
"loss": 1.2753,
"step": 97
},
{
"epoch": 2.0851063829787235,
"grad_norm": 0.3629004955291748,
"learning_rate": 2.4797153385633147e-05,
"loss": 1.0902,
"step": 98
},
{
"epoch": 2.106382978723404,
"grad_norm": 0.3911716938018799,
"learning_rate": 2.3755496983081708e-05,
"loss": 1.0178,
"step": 99
},
{
"epoch": 2.127659574468085,
"grad_norm": 0.475111186504364,
"learning_rate": 2.2729508640669428e-05,
"loss": 0.8545,
"step": 100
},
{
"epoch": 2.127659574468085,
"eval_loss": 0.9274308681488037,
"eval_runtime": 3.2361,
"eval_samples_per_second": 195.604,
"eval_steps_per_second": 6.18,
"step": 100
},
{
"epoch": 2.148936170212766,
"grad_norm": 0.2774420976638794,
"learning_rate": 2.1719778394743813e-05,
"loss": 0.212,
"step": 101
},
{
"epoch": 2.1702127659574466,
"grad_norm": 0.3814822733402252,
"learning_rate": 2.0726886931772476e-05,
"loss": 0.7554,
"step": 102
},
{
"epoch": 2.1914893617021276,
"grad_norm": 0.423093318939209,
"learning_rate": 1.9751405254395587e-05,
"loss": 1.3008,
"step": 103
},
{
"epoch": 2.2127659574468086,
"grad_norm": 0.38136741518974304,
"learning_rate": 1.879389435304766e-05,
"loss": 1.1388,
"step": 104
},
{
"epoch": 2.2340425531914896,
"grad_norm": 0.39356493949890137,
"learning_rate": 1.7854904883337184e-05,
"loss": 0.9796,
"step": 105
},
{
"epoch": 2.25531914893617,
"grad_norm": 0.47382405400276184,
"learning_rate": 1.693497684936963e-05,
"loss": 0.8731,
"step": 106
},
{
"epoch": 2.276595744680851,
"grad_norm": 0.4495427906513214,
"learning_rate": 1.6034639293196224e-05,
"loss": 0.4196,
"step": 107
},
{
"epoch": 2.297872340425532,
"grad_norm": 0.26799333095550537,
"learning_rate": 1.515440999056669e-05,
"loss": 0.338,
"step": 108
},
{
"epoch": 2.3191489361702127,
"grad_norm": 0.4940812587738037,
"learning_rate": 1.429479515316127e-05,
"loss": 1.1992,
"step": 109
},
{
"epoch": 2.3404255319148937,
"grad_norm": 0.4212472140789032,
"learning_rate": 1.3456289137473124e-05,
"loss": 1.153,
"step": 110
},
{
"epoch": 2.3617021276595747,
"grad_norm": 0.4149324893951416,
"learning_rate": 1.263937416050847e-05,
"loss": 1.05,
"step": 111
},
{
"epoch": 2.382978723404255,
"grad_norm": 0.4490218460559845,
"learning_rate": 1.1844520022468092e-05,
"loss": 0.9362,
"step": 112
},
{
"epoch": 2.404255319148936,
"grad_norm": 0.5155778527259827,
"learning_rate": 1.1072183836569599e-05,
"loss": 0.615,
"step": 113
},
{
"epoch": 2.425531914893617,
"grad_norm": 0.1996319442987442,
"learning_rate": 1.0322809766165916e-05,
"loss": 0.1577,
"step": 114
},
{
"epoch": 2.4468085106382977,
"grad_norm": 0.5449104905128479,
"learning_rate": 9.596828769311028e-06,
"loss": 1.2252,
"step": 115
},
{
"epoch": 2.4680851063829787,
"grad_norm": 0.4638878405094147,
"learning_rate": 8.894658350919999e-06,
"loss": 1.2128,
"step": 116
},
{
"epoch": 2.4893617021276597,
"grad_norm": 0.4165002107620239,
"learning_rate": 8.216702322665849e-06,
"loss": 1.0802,
"step": 117
},
{
"epoch": 2.5106382978723403,
"grad_norm": 0.42553117871284485,
"learning_rate": 7.563350570751137e-06,
"loss": 0.9043,
"step": 118
},
{
"epoch": 2.5319148936170213,
"grad_norm": 0.5580489039421082,
"learning_rate": 6.934978831688112e-06,
"loss": 0.7406,
"step": 119
},
{
"epoch": 2.5531914893617023,
"grad_norm": 0.14457367360591888,
"learning_rate": 6.331948476216073e-06,
"loss": 0.0878,
"step": 120
},
{
"epoch": 2.574468085106383,
"grad_norm": 0.46198827028274536,
"learning_rate": 5.754606301480452e-06,
"loss": 1.0055,
"step": 121
},
{
"epoch": 2.595744680851064,
"grad_norm": 0.4651603102684021,
"learning_rate": 5.2032843315930305e-06,
"loss": 1.2232,
"step": 122
},
{
"epoch": 2.617021276595745,
"grad_norm": 0.41709625720977783,
"learning_rate": 4.678299626687903e-06,
"loss": 1.0644,
"step": 123
},
{
"epoch": 2.6382978723404253,
"grad_norm": 0.42545390129089355,
"learning_rate": 4.179954100583199e-06,
"loss": 0.9916,
"step": 124
},
{
"epoch": 2.6595744680851063,
"grad_norm": 0.5188893675804138,
"learning_rate": 3.708534347153212e-06,
"loss": 0.8641,
"step": 125
},
{
"epoch": 2.6808510638297873,
"grad_norm": 0.30658212304115295,
"learning_rate": 3.26431147551097e-06,
"loss": 0.1864,
"step": 126
},
{
"epoch": 2.702127659574468,
"grad_norm": 0.4011896252632141,
"learning_rate": 2.8475409540958616e-06,
"loss": 0.6371,
"step": 127
},
{
"epoch": 2.723404255319149,
"grad_norm": 0.4772135615348816,
"learning_rate": 2.45846246375617e-06,
"loss": 1.2719,
"step": 128
},
{
"epoch": 2.74468085106383,
"grad_norm": 0.44009700417518616,
"learning_rate": 2.097299759910797e-06,
"loss": 1.1568,
"step": 129
},
{
"epoch": 2.7659574468085104,
"grad_norm": 0.42015552520751953,
"learning_rate": 1.7642605438696306e-06,
"loss": 1.0539,
"step": 130
},
{
"epoch": 2.7872340425531914,
"grad_norm": 0.4982571303844452,
"learning_rate": 1.4595363433864484e-06,
"loss": 0.8517,
"step": 131
},
{
"epoch": 2.8085106382978724,
"grad_norm": 0.4780231714248657,
"learning_rate": 1.1833024025130858e-06,
"loss": 0.3794,
"step": 132
},
{
"epoch": 2.829787234042553,
"grad_norm": 0.3601129949092865,
"learning_rate": 9.357175808182305e-07,
"loss": 0.5229,
"step": 133
},
{
"epoch": 2.851063829787234,
"grad_norm": 0.5078785419464111,
"learning_rate": 7.169242620287227e-07,
"loss": 1.2511,
"step": 134
},
{
"epoch": 2.872340425531915,
"grad_norm": 0.42775991559028625,
"learning_rate": 5.270482721460563e-07,
"loss": 1.1546,
"step": 135
},
{
"epoch": 2.8936170212765955,
"grad_norm": 0.43165627121925354,
"learning_rate": 3.6619880708494724e-07,
"loss": 1.0596,
"step": 136
},
{
"epoch": 2.9148936170212765,
"grad_norm": 0.4631091356277466,
"learning_rate": 2.3446836987585295e-07,
"loss": 0.9146,
"step": 137
},
{
"epoch": 2.9361702127659575,
"grad_norm": 0.5159528851509094,
"learning_rate": 1.319327174672832e-07,
"loss": 0.5763,
"step": 138
},
{
"epoch": 2.9574468085106385,
"grad_norm": 0.426923006772995,
"learning_rate": 5.865081715870424e-08,
"loss": 0.7757,
"step": 139
},
{
"epoch": 2.978723404255319,
"grad_norm": 0.4215574264526367,
"learning_rate": 1.4664812689001438e-08,
"loss": 1.14,
"step": 140
},
{
"epoch": 3.0,
"grad_norm": 0.5064499378204346,
"learning_rate": 0.0,
"loss": 0.8367,
"step": 141
}
],
"logging_steps": 1,
"max_steps": 141,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.466930952990884e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}