cantonesellm-cpt-202405 / trainer_state.json
indiejoseph's picture
Model save
8ddabd8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9991804622193083,
"eval_steps": 500,
"global_step": 1143,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008741736327378026,
"grad_norm": 19.75,
"learning_rate": 8.620689655172415e-07,
"loss": 4.7931,
"step": 1
},
{
"epoch": 0.0017483472654756052,
"grad_norm": 21.75,
"learning_rate": 1.724137931034483e-06,
"loss": 4.937,
"step": 2
},
{
"epoch": 0.0026225208982134074,
"grad_norm": 20.625,
"learning_rate": 2.586206896551724e-06,
"loss": 5.0176,
"step": 3
},
{
"epoch": 0.0034966945309512104,
"grad_norm": 19.875,
"learning_rate": 3.448275862068966e-06,
"loss": 4.8236,
"step": 4
},
{
"epoch": 0.004370868163689013,
"grad_norm": 16.875,
"learning_rate": 4.310344827586207e-06,
"loss": 4.761,
"step": 5
},
{
"epoch": 0.005245041796426815,
"grad_norm": 16.125,
"learning_rate": 5.172413793103448e-06,
"loss": 4.9055,
"step": 6
},
{
"epoch": 0.006119215429164618,
"grad_norm": 11.5625,
"learning_rate": 6.03448275862069e-06,
"loss": 4.6787,
"step": 7
},
{
"epoch": 0.006993389061902421,
"grad_norm": 14.5625,
"learning_rate": 6.896551724137932e-06,
"loss": 4.6797,
"step": 8
},
{
"epoch": 0.007867562694640224,
"grad_norm": 19.0,
"learning_rate": 7.758620689655173e-06,
"loss": 4.6406,
"step": 9
},
{
"epoch": 0.008741736327378026,
"grad_norm": 12.0625,
"learning_rate": 8.620689655172414e-06,
"loss": 4.5986,
"step": 10
},
{
"epoch": 0.009615909960115828,
"grad_norm": 7.8125,
"learning_rate": 9.482758620689655e-06,
"loss": 4.5762,
"step": 11
},
{
"epoch": 0.01049008359285363,
"grad_norm": 6.875,
"learning_rate": 1.0344827586206897e-05,
"loss": 4.5882,
"step": 12
},
{
"epoch": 0.011364257225591434,
"grad_norm": 6.0,
"learning_rate": 1.1206896551724138e-05,
"loss": 4.3547,
"step": 13
},
{
"epoch": 0.012238430858329236,
"grad_norm": 6.125,
"learning_rate": 1.206896551724138e-05,
"loss": 4.2879,
"step": 14
},
{
"epoch": 0.013112604491067038,
"grad_norm": 5.875,
"learning_rate": 1.2931034482758622e-05,
"loss": 4.3241,
"step": 15
},
{
"epoch": 0.013986778123804841,
"grad_norm": 4.625,
"learning_rate": 1.3793103448275863e-05,
"loss": 4.386,
"step": 16
},
{
"epoch": 0.014860951756542643,
"grad_norm": 3.390625,
"learning_rate": 1.4655172413793103e-05,
"loss": 4.266,
"step": 17
},
{
"epoch": 0.015735125389280447,
"grad_norm": 3.296875,
"learning_rate": 1.5517241379310346e-05,
"loss": 4.2994,
"step": 18
},
{
"epoch": 0.01660929902201825,
"grad_norm": 2.796875,
"learning_rate": 1.6379310344827585e-05,
"loss": 4.1198,
"step": 19
},
{
"epoch": 0.01748347265475605,
"grad_norm": 4.0625,
"learning_rate": 1.7241379310344828e-05,
"loss": 4.1751,
"step": 20
},
{
"epoch": 0.018357646287493853,
"grad_norm": 3.109375,
"learning_rate": 1.810344827586207e-05,
"loss": 3.9965,
"step": 21
},
{
"epoch": 0.019231819920231655,
"grad_norm": 2.875,
"learning_rate": 1.896551724137931e-05,
"loss": 3.9684,
"step": 22
},
{
"epoch": 0.020105993552969458,
"grad_norm": 2.953125,
"learning_rate": 1.9827586206896554e-05,
"loss": 3.9812,
"step": 23
},
{
"epoch": 0.02098016718570726,
"grad_norm": 2.28125,
"learning_rate": 2.0689655172413793e-05,
"loss": 3.9997,
"step": 24
},
{
"epoch": 0.021854340818445065,
"grad_norm": 2.53125,
"learning_rate": 2.1551724137931033e-05,
"loss": 3.9327,
"step": 25
},
{
"epoch": 0.022728514451182867,
"grad_norm": 2.09375,
"learning_rate": 2.2413793103448276e-05,
"loss": 3.937,
"step": 26
},
{
"epoch": 0.02360268808392067,
"grad_norm": 2.125,
"learning_rate": 2.327586206896552e-05,
"loss": 3.8923,
"step": 27
},
{
"epoch": 0.02447686171665847,
"grad_norm": 1.6171875,
"learning_rate": 2.413793103448276e-05,
"loss": 3.8121,
"step": 28
},
{
"epoch": 0.025351035349396273,
"grad_norm": 1.671875,
"learning_rate": 2.5e-05,
"loss": 3.8329,
"step": 29
},
{
"epoch": 0.026225208982134075,
"grad_norm": 1.5625,
"learning_rate": 2.5862068965517244e-05,
"loss": 3.7719,
"step": 30
},
{
"epoch": 0.027099382614871877,
"grad_norm": 1.59375,
"learning_rate": 2.672413793103448e-05,
"loss": 3.8168,
"step": 31
},
{
"epoch": 0.027973556247609683,
"grad_norm": 1.6796875,
"learning_rate": 2.7586206896551727e-05,
"loss": 3.6792,
"step": 32
},
{
"epoch": 0.028847729880347485,
"grad_norm": 1.421875,
"learning_rate": 2.844827586206897e-05,
"loss": 3.6723,
"step": 33
},
{
"epoch": 0.029721903513085287,
"grad_norm": 1.3671875,
"learning_rate": 2.9310344827586206e-05,
"loss": 3.6362,
"step": 34
},
{
"epoch": 0.03059607714582309,
"grad_norm": 1.671875,
"learning_rate": 3.017241379310345e-05,
"loss": 3.6452,
"step": 35
},
{
"epoch": 0.031470250778560895,
"grad_norm": 1.8046875,
"learning_rate": 3.103448275862069e-05,
"loss": 3.5118,
"step": 36
},
{
"epoch": 0.03234442441129869,
"grad_norm": 1.8125,
"learning_rate": 3.1896551724137935e-05,
"loss": 3.4852,
"step": 37
},
{
"epoch": 0.0332185980440365,
"grad_norm": 1.484375,
"learning_rate": 3.275862068965517e-05,
"loss": 3.3851,
"step": 38
},
{
"epoch": 0.0340927716767743,
"grad_norm": 1.3359375,
"learning_rate": 3.3620689655172414e-05,
"loss": 3.3676,
"step": 39
},
{
"epoch": 0.0349669453095121,
"grad_norm": 1.4375,
"learning_rate": 3.4482758620689657e-05,
"loss": 3.4513,
"step": 40
},
{
"epoch": 0.0358411189422499,
"grad_norm": 1.7421875,
"learning_rate": 3.53448275862069e-05,
"loss": 3.3572,
"step": 41
},
{
"epoch": 0.03671529257498771,
"grad_norm": 1.6796875,
"learning_rate": 3.620689655172414e-05,
"loss": 3.229,
"step": 42
},
{
"epoch": 0.03758946620772551,
"grad_norm": 1.3046875,
"learning_rate": 3.7068965517241385e-05,
"loss": 3.2683,
"step": 43
},
{
"epoch": 0.03846363984046331,
"grad_norm": 2.078125,
"learning_rate": 3.793103448275862e-05,
"loss": 3.2209,
"step": 44
},
{
"epoch": 0.039337813473201116,
"grad_norm": 1.3515625,
"learning_rate": 3.8793103448275865e-05,
"loss": 3.3169,
"step": 45
},
{
"epoch": 0.040211987105938915,
"grad_norm": 1.3046875,
"learning_rate": 3.965517241379311e-05,
"loss": 3.2609,
"step": 46
},
{
"epoch": 0.04108616073867672,
"grad_norm": 1.75,
"learning_rate": 4.0517241379310344e-05,
"loss": 3.2348,
"step": 47
},
{
"epoch": 0.04196033437141452,
"grad_norm": 1.5546875,
"learning_rate": 4.1379310344827587e-05,
"loss": 3.2157,
"step": 48
},
{
"epoch": 0.042834508004152325,
"grad_norm": 1.3046875,
"learning_rate": 4.224137931034483e-05,
"loss": 3.118,
"step": 49
},
{
"epoch": 0.04370868163689013,
"grad_norm": 1.265625,
"learning_rate": 4.3103448275862066e-05,
"loss": 3.1812,
"step": 50
},
{
"epoch": 0.04458285526962793,
"grad_norm": 1.28125,
"learning_rate": 4.396551724137931e-05,
"loss": 3.1164,
"step": 51
},
{
"epoch": 0.045457028902365734,
"grad_norm": 1.296875,
"learning_rate": 4.482758620689655e-05,
"loss": 3.1016,
"step": 52
},
{
"epoch": 0.04633120253510353,
"grad_norm": 1.3046875,
"learning_rate": 4.5689655172413794e-05,
"loss": 3.0119,
"step": 53
},
{
"epoch": 0.04720537616784134,
"grad_norm": 1.265625,
"learning_rate": 4.655172413793104e-05,
"loss": 3.0376,
"step": 54
},
{
"epoch": 0.04807954980057914,
"grad_norm": 1.359375,
"learning_rate": 4.741379310344828e-05,
"loss": 3.0525,
"step": 55
},
{
"epoch": 0.04895372343331694,
"grad_norm": 1.21875,
"learning_rate": 4.827586206896552e-05,
"loss": 3.0417,
"step": 56
},
{
"epoch": 0.04982789706605475,
"grad_norm": 1.1640625,
"learning_rate": 4.913793103448276e-05,
"loss": 2.9921,
"step": 57
},
{
"epoch": 0.050702070698792547,
"grad_norm": 1.390625,
"learning_rate": 5e-05,
"loss": 2.9874,
"step": 58
},
{
"epoch": 0.05157624433153035,
"grad_norm": 1.546875,
"learning_rate": 4.9999895202727756e-05,
"loss": 2.9822,
"step": 59
},
{
"epoch": 0.05245041796426815,
"grad_norm": 1.4375,
"learning_rate": 4.9999580811789614e-05,
"loss": 2.9278,
"step": 60
},
{
"epoch": 0.053324591597005956,
"grad_norm": 1.171875,
"learning_rate": 4.999905682982135e-05,
"loss": 2.9424,
"step": 61
},
{
"epoch": 0.054198765229743755,
"grad_norm": 1.21875,
"learning_rate": 4.999832326121594e-05,
"loss": 2.8771,
"step": 62
},
{
"epoch": 0.05507293886248156,
"grad_norm": 1.15625,
"learning_rate": 4.999738011212344e-05,
"loss": 2.9132,
"step": 63
},
{
"epoch": 0.055947112495219366,
"grad_norm": 1.15625,
"learning_rate": 4.999622739045101e-05,
"loss": 2.9479,
"step": 64
},
{
"epoch": 0.056821286127957164,
"grad_norm": 1.2890625,
"learning_rate": 4.999486510586282e-05,
"loss": 2.948,
"step": 65
},
{
"epoch": 0.05769545976069497,
"grad_norm": 1.3828125,
"learning_rate": 4.9993293269779975e-05,
"loss": 2.948,
"step": 66
},
{
"epoch": 0.05856963339343277,
"grad_norm": 1.28125,
"learning_rate": 4.9991511895380396e-05,
"loss": 2.9111,
"step": 67
},
{
"epoch": 0.059443807026170574,
"grad_norm": 1.375,
"learning_rate": 4.998952099759874e-05,
"loss": 2.9496,
"step": 68
},
{
"epoch": 0.06031798065890837,
"grad_norm": 1.125,
"learning_rate": 4.998732059312625e-05,
"loss": 2.8007,
"step": 69
},
{
"epoch": 0.06119215429164618,
"grad_norm": 1.40625,
"learning_rate": 4.998491070041066e-05,
"loss": 2.8642,
"step": 70
},
{
"epoch": 0.062066327924383984,
"grad_norm": 1.21875,
"learning_rate": 4.998229133965596e-05,
"loss": 2.8762,
"step": 71
},
{
"epoch": 0.06294050155712179,
"grad_norm": 1.109375,
"learning_rate": 4.997946253282231e-05,
"loss": 2.8961,
"step": 72
},
{
"epoch": 0.06381467518985959,
"grad_norm": 1.25,
"learning_rate": 4.9976424303625815e-05,
"loss": 2.825,
"step": 73
},
{
"epoch": 0.06468884882259739,
"grad_norm": 1.1953125,
"learning_rate": 4.997317667753831e-05,
"loss": 2.8532,
"step": 74
},
{
"epoch": 0.06556302245533518,
"grad_norm": 1.09375,
"learning_rate": 4.9969719681787196e-05,
"loss": 2.9245,
"step": 75
},
{
"epoch": 0.066437196088073,
"grad_norm": 1.0859375,
"learning_rate": 4.9966053345355174e-05,
"loss": 2.7549,
"step": 76
},
{
"epoch": 0.0673113697208108,
"grad_norm": 1.046875,
"learning_rate": 4.9962177698979995e-05,
"loss": 2.8295,
"step": 77
},
{
"epoch": 0.0681855433535486,
"grad_norm": 1.1328125,
"learning_rate": 4.995809277515424e-05,
"loss": 2.7792,
"step": 78
},
{
"epoch": 0.0690597169862864,
"grad_norm": 1.1015625,
"learning_rate": 4.9953798608125025e-05,
"loss": 2.7635,
"step": 79
},
{
"epoch": 0.0699338906190242,
"grad_norm": 1.3203125,
"learning_rate": 4.99492952338937e-05,
"loss": 2.8317,
"step": 80
},
{
"epoch": 0.070808064251762,
"grad_norm": 1.1171875,
"learning_rate": 4.994458269021557e-05,
"loss": 2.7627,
"step": 81
},
{
"epoch": 0.0716822378844998,
"grad_norm": 1.09375,
"learning_rate": 4.993966101659958e-05,
"loss": 2.8273,
"step": 82
},
{
"epoch": 0.07255641151723762,
"grad_norm": 1.1171875,
"learning_rate": 4.993453025430797e-05,
"loss": 2.8587,
"step": 83
},
{
"epoch": 0.07343058514997541,
"grad_norm": 1.1953125,
"learning_rate": 4.992919044635592e-05,
"loss": 2.8023,
"step": 84
},
{
"epoch": 0.07430475878271321,
"grad_norm": 1.140625,
"learning_rate": 4.9923641637511226e-05,
"loss": 2.6944,
"step": 85
},
{
"epoch": 0.07517893241545102,
"grad_norm": 1.1640625,
"learning_rate": 4.991788387429388e-05,
"loss": 2.7955,
"step": 86
},
{
"epoch": 0.07605310604818882,
"grad_norm": 1.3671875,
"learning_rate": 4.9911917204975724e-05,
"loss": 2.8184,
"step": 87
},
{
"epoch": 0.07692727968092662,
"grad_norm": 1.3671875,
"learning_rate": 4.9905741679580007e-05,
"loss": 2.8002,
"step": 88
},
{
"epoch": 0.07780145331366442,
"grad_norm": 1.1328125,
"learning_rate": 4.989935734988098e-05,
"loss": 2.7749,
"step": 89
},
{
"epoch": 0.07867562694640223,
"grad_norm": 1.2578125,
"learning_rate": 4.989276426940348e-05,
"loss": 2.8351,
"step": 90
},
{
"epoch": 0.07954980057914003,
"grad_norm": 0.9609375,
"learning_rate": 4.988596249342244e-05,
"loss": 2.7638,
"step": 91
},
{
"epoch": 0.08042397421187783,
"grad_norm": 1.2265625,
"learning_rate": 4.987895207896248e-05,
"loss": 2.7492,
"step": 92
},
{
"epoch": 0.08129814784461564,
"grad_norm": 1.1328125,
"learning_rate": 4.987173308479738e-05,
"loss": 2.7668,
"step": 93
},
{
"epoch": 0.08217232147735344,
"grad_norm": 1.2265625,
"learning_rate": 4.9864305571449616e-05,
"loss": 2.7527,
"step": 94
},
{
"epoch": 0.08304649511009124,
"grad_norm": 1.109375,
"learning_rate": 4.985666960118983e-05,
"loss": 2.7963,
"step": 95
},
{
"epoch": 0.08392066874282904,
"grad_norm": 1.109375,
"learning_rate": 4.984882523803634e-05,
"loss": 2.7924,
"step": 96
},
{
"epoch": 0.08479484237556685,
"grad_norm": 1.4375,
"learning_rate": 4.9840772547754566e-05,
"loss": 2.763,
"step": 97
},
{
"epoch": 0.08566901600830465,
"grad_norm": 1.203125,
"learning_rate": 4.983251159785651e-05,
"loss": 2.7398,
"step": 98
},
{
"epoch": 0.08654318964104245,
"grad_norm": 1.1875,
"learning_rate": 4.982404245760018e-05,
"loss": 2.7528,
"step": 99
},
{
"epoch": 0.08741736327378026,
"grad_norm": 1.1328125,
"learning_rate": 4.9815365197988986e-05,
"loss": 2.8205,
"step": 100
},
{
"epoch": 0.08829153690651806,
"grad_norm": 1.109375,
"learning_rate": 4.9806479891771195e-05,
"loss": 2.7228,
"step": 101
},
{
"epoch": 0.08916571053925586,
"grad_norm": 1.03125,
"learning_rate": 4.9797386613439265e-05,
"loss": 2.7599,
"step": 102
},
{
"epoch": 0.09003988417199366,
"grad_norm": 1.0703125,
"learning_rate": 4.978808543922925e-05,
"loss": 2.7388,
"step": 103
},
{
"epoch": 0.09091405780473147,
"grad_norm": 1.1484375,
"learning_rate": 4.9778576447120184e-05,
"loss": 2.7801,
"step": 104
},
{
"epoch": 0.09178823143746927,
"grad_norm": 0.97265625,
"learning_rate": 4.976885971683337e-05,
"loss": 2.656,
"step": 105
},
{
"epoch": 0.09266240507020707,
"grad_norm": 1.078125,
"learning_rate": 4.9758935329831754e-05,
"loss": 2.763,
"step": 106
},
{
"epoch": 0.09353657870294488,
"grad_norm": 1.078125,
"learning_rate": 4.974880336931923e-05,
"loss": 2.7975,
"step": 107
},
{
"epoch": 0.09441075233568268,
"grad_norm": 1.078125,
"learning_rate": 4.9738463920239955e-05,
"loss": 2.7029,
"step": 108
},
{
"epoch": 0.09528492596842048,
"grad_norm": 1.0546875,
"learning_rate": 4.972791706927759e-05,
"loss": 2.689,
"step": 109
},
{
"epoch": 0.09615909960115827,
"grad_norm": 1.0546875,
"learning_rate": 4.9717162904854664e-05,
"loss": 2.7322,
"step": 110
},
{
"epoch": 0.09703327323389609,
"grad_norm": 1.0546875,
"learning_rate": 4.9706201517131725e-05,
"loss": 2.778,
"step": 111
},
{
"epoch": 0.09790744686663388,
"grad_norm": 1.0703125,
"learning_rate": 4.9695032998006655e-05,
"loss": 2.8284,
"step": 112
},
{
"epoch": 0.09878162049937168,
"grad_norm": 1.0546875,
"learning_rate": 4.9683657441113884e-05,
"loss": 2.71,
"step": 113
},
{
"epoch": 0.0996557941321095,
"grad_norm": 1.2109375,
"learning_rate": 4.967207494182361e-05,
"loss": 2.6782,
"step": 114
},
{
"epoch": 0.1005299677648473,
"grad_norm": 1.2421875,
"learning_rate": 4.966028559724096e-05,
"loss": 2.706,
"step": 115
},
{
"epoch": 0.10140414139758509,
"grad_norm": 1.09375,
"learning_rate": 4.964828950620524e-05,
"loss": 2.7667,
"step": 116
},
{
"epoch": 0.10227831503032289,
"grad_norm": 0.94921875,
"learning_rate": 4.963608676928905e-05,
"loss": 2.685,
"step": 117
},
{
"epoch": 0.1031524886630607,
"grad_norm": 1.0234375,
"learning_rate": 4.962367748879748e-05,
"loss": 2.6407,
"step": 118
},
{
"epoch": 0.1040266622957985,
"grad_norm": 1.1015625,
"learning_rate": 4.961106176876723e-05,
"loss": 2.662,
"step": 119
},
{
"epoch": 0.1049008359285363,
"grad_norm": 1.0078125,
"learning_rate": 4.959823971496574e-05,
"loss": 2.7101,
"step": 120
},
{
"epoch": 0.10577500956127411,
"grad_norm": 1.0390625,
"learning_rate": 4.958521143489032e-05,
"loss": 2.7607,
"step": 121
},
{
"epoch": 0.10664918319401191,
"grad_norm": 1.0078125,
"learning_rate": 4.9571977037767217e-05,
"loss": 2.6531,
"step": 122
},
{
"epoch": 0.10752335682674971,
"grad_norm": 0.98828125,
"learning_rate": 4.955853663455072e-05,
"loss": 2.6706,
"step": 123
},
{
"epoch": 0.10839753045948751,
"grad_norm": 1.0390625,
"learning_rate": 4.954489033792227e-05,
"loss": 2.6516,
"step": 124
},
{
"epoch": 0.10927170409222532,
"grad_norm": 1.1015625,
"learning_rate": 4.95310382622894e-05,
"loss": 2.6962,
"step": 125
},
{
"epoch": 0.11014587772496312,
"grad_norm": 1.015625,
"learning_rate": 4.951698052378492e-05,
"loss": 2.702,
"step": 126
},
{
"epoch": 0.11102005135770092,
"grad_norm": 1.0390625,
"learning_rate": 4.950271724026582e-05,
"loss": 2.6833,
"step": 127
},
{
"epoch": 0.11189422499043873,
"grad_norm": 1.234375,
"learning_rate": 4.948824853131236e-05,
"loss": 2.691,
"step": 128
},
{
"epoch": 0.11276839862317653,
"grad_norm": 0.9921875,
"learning_rate": 4.947357451822706e-05,
"loss": 2.64,
"step": 129
},
{
"epoch": 0.11364257225591433,
"grad_norm": 1.09375,
"learning_rate": 4.945869532403362e-05,
"loss": 2.6507,
"step": 130
},
{
"epoch": 0.11451674588865213,
"grad_norm": 0.984375,
"learning_rate": 4.944361107347597e-05,
"loss": 2.7446,
"step": 131
},
{
"epoch": 0.11539091952138994,
"grad_norm": 1.09375,
"learning_rate": 4.942832189301716e-05,
"loss": 2.6651,
"step": 132
},
{
"epoch": 0.11626509315412774,
"grad_norm": 1.1015625,
"learning_rate": 4.941282791083836e-05,
"loss": 2.6495,
"step": 133
},
{
"epoch": 0.11713926678686554,
"grad_norm": 1.109375,
"learning_rate": 4.9397129256837724e-05,
"loss": 2.6474,
"step": 134
},
{
"epoch": 0.11801344041960335,
"grad_norm": 1.171875,
"learning_rate": 4.938122606262936e-05,
"loss": 2.6893,
"step": 135
},
{
"epoch": 0.11888761405234115,
"grad_norm": 1.1171875,
"learning_rate": 4.936511846154215e-05,
"loss": 2.6667,
"step": 136
},
{
"epoch": 0.11976178768507895,
"grad_norm": 1.0234375,
"learning_rate": 4.934880658861872e-05,
"loss": 2.7114,
"step": 137
},
{
"epoch": 0.12063596131781675,
"grad_norm": 1.0859375,
"learning_rate": 4.933229058061425e-05,
"loss": 2.6641,
"step": 138
},
{
"epoch": 0.12151013495055456,
"grad_norm": 1.109375,
"learning_rate": 4.9315570575995364e-05,
"loss": 2.7359,
"step": 139
},
{
"epoch": 0.12238430858329236,
"grad_norm": 1.0703125,
"learning_rate": 4.92986467149389e-05,
"loss": 2.6406,
"step": 140
},
{
"epoch": 0.12325848221603015,
"grad_norm": 1.0,
"learning_rate": 4.9281519139330846e-05,
"loss": 2.6395,
"step": 141
},
{
"epoch": 0.12413265584876797,
"grad_norm": 1.1015625,
"learning_rate": 4.926418799276504e-05,
"loss": 2.664,
"step": 142
},
{
"epoch": 0.12500682948150577,
"grad_norm": 0.99609375,
"learning_rate": 4.924665342054204e-05,
"loss": 2.6725,
"step": 143
},
{
"epoch": 0.12588100311424358,
"grad_norm": 1.0625,
"learning_rate": 4.922891556966788e-05,
"loss": 2.6244,
"step": 144
},
{
"epoch": 0.12675517674698136,
"grad_norm": 1.078125,
"learning_rate": 4.921097458885282e-05,
"loss": 2.6786,
"step": 145
},
{
"epoch": 0.12762935037971918,
"grad_norm": 0.9765625,
"learning_rate": 4.9192830628510126e-05,
"loss": 2.7084,
"step": 146
},
{
"epoch": 0.128503524012457,
"grad_norm": 1.1640625,
"learning_rate": 4.9174483840754815e-05,
"loss": 2.688,
"step": 147
},
{
"epoch": 0.12937769764519477,
"grad_norm": 0.99609375,
"learning_rate": 4.9155934379402335e-05,
"loss": 2.6582,
"step": 148
},
{
"epoch": 0.13025187127793258,
"grad_norm": 1.0625,
"learning_rate": 4.9137182399967343e-05,
"loss": 2.6099,
"step": 149
},
{
"epoch": 0.13112604491067037,
"grad_norm": 0.96484375,
"learning_rate": 4.911822805966232e-05,
"loss": 2.6315,
"step": 150
},
{
"epoch": 0.13200021854340818,
"grad_norm": 1.109375,
"learning_rate": 4.909907151739633e-05,
"loss": 2.6418,
"step": 151
},
{
"epoch": 0.132874392176146,
"grad_norm": 0.94140625,
"learning_rate": 4.907971293377365e-05,
"loss": 2.6344,
"step": 152
},
{
"epoch": 0.13374856580888378,
"grad_norm": 1.078125,
"learning_rate": 4.9060152471092414e-05,
"loss": 2.6904,
"step": 153
},
{
"epoch": 0.1346227394416216,
"grad_norm": 1.0625,
"learning_rate": 4.904039029334326e-05,
"loss": 2.6464,
"step": 154
},
{
"epoch": 0.1354969130743594,
"grad_norm": 0.95703125,
"learning_rate": 4.9020426566207997e-05,
"loss": 2.6811,
"step": 155
},
{
"epoch": 0.1363710867070972,
"grad_norm": 1.0078125,
"learning_rate": 4.900026145705815e-05,
"loss": 2.6346,
"step": 156
},
{
"epoch": 0.137245260339835,
"grad_norm": 0.9375,
"learning_rate": 4.897989513495358e-05,
"loss": 2.6762,
"step": 157
},
{
"epoch": 0.1381194339725728,
"grad_norm": 1.0546875,
"learning_rate": 4.89593277706411e-05,
"loss": 2.6383,
"step": 158
},
{
"epoch": 0.1389936076053106,
"grad_norm": 0.9765625,
"learning_rate": 4.8938559536552994e-05,
"loss": 2.634,
"step": 159
},
{
"epoch": 0.1398677812380484,
"grad_norm": 1.0078125,
"learning_rate": 4.891759060680562e-05,
"loss": 2.6626,
"step": 160
},
{
"epoch": 0.14074195487078622,
"grad_norm": 0.9609375,
"learning_rate": 4.8896421157197896e-05,
"loss": 2.664,
"step": 161
},
{
"epoch": 0.141616128503524,
"grad_norm": 1.046875,
"learning_rate": 4.887505136520987e-05,
"loss": 2.6787,
"step": 162
},
{
"epoch": 0.14249030213626182,
"grad_norm": 1.015625,
"learning_rate": 4.885348141000122e-05,
"loss": 2.6107,
"step": 163
},
{
"epoch": 0.1433644757689996,
"grad_norm": 0.97265625,
"learning_rate": 4.883171147240975e-05,
"loss": 2.6128,
"step": 164
},
{
"epoch": 0.14423864940173742,
"grad_norm": 1.0859375,
"learning_rate": 4.880974173494984e-05,
"loss": 2.6087,
"step": 165
},
{
"epoch": 0.14511282303447523,
"grad_norm": 1.0234375,
"learning_rate": 4.8787572381811e-05,
"loss": 2.6377,
"step": 166
},
{
"epoch": 0.14598699666721301,
"grad_norm": 1.0078125,
"learning_rate": 4.876520359885624e-05,
"loss": 2.6326,
"step": 167
},
{
"epoch": 0.14686117029995083,
"grad_norm": 1.015625,
"learning_rate": 4.874263557362056e-05,
"loss": 2.6361,
"step": 168
},
{
"epoch": 0.14773534393268864,
"grad_norm": 1.0859375,
"learning_rate": 4.871986849530934e-05,
"loss": 2.7243,
"step": 169
},
{
"epoch": 0.14860951756542642,
"grad_norm": 0.96875,
"learning_rate": 4.869690255479682e-05,
"loss": 2.6845,
"step": 170
},
{
"epoch": 0.14948369119816424,
"grad_norm": 1.0234375,
"learning_rate": 4.867373794462442e-05,
"loss": 2.6677,
"step": 171
},
{
"epoch": 0.15035786483090205,
"grad_norm": 1.046875,
"learning_rate": 4.8650374858999185e-05,
"loss": 2.659,
"step": 172
},
{
"epoch": 0.15123203846363983,
"grad_norm": 0.9921875,
"learning_rate": 4.862681349379212e-05,
"loss": 2.6327,
"step": 173
},
{
"epoch": 0.15210621209637765,
"grad_norm": 1.2265625,
"learning_rate": 4.860305404653657e-05,
"loss": 2.6229,
"step": 174
},
{
"epoch": 0.15298038572911546,
"grad_norm": 1.0234375,
"learning_rate": 4.857909671642656e-05,
"loss": 2.618,
"step": 175
},
{
"epoch": 0.15385455936185324,
"grad_norm": 1.0078125,
"learning_rate": 4.8554941704315116e-05,
"loss": 2.5778,
"step": 176
},
{
"epoch": 0.15472873299459106,
"grad_norm": 0.94921875,
"learning_rate": 4.853058921271259e-05,
"loss": 2.5795,
"step": 177
},
{
"epoch": 0.15560290662732884,
"grad_norm": 1.0390625,
"learning_rate": 4.850603944578494e-05,
"loss": 2.6069,
"step": 178
},
{
"epoch": 0.15647708026006665,
"grad_norm": 0.9765625,
"learning_rate": 4.848129260935208e-05,
"loss": 2.6211,
"step": 179
},
{
"epoch": 0.15735125389280447,
"grad_norm": 1.1171875,
"learning_rate": 4.845634891088608e-05,
"loss": 2.601,
"step": 180
},
{
"epoch": 0.15822542752554225,
"grad_norm": 0.98828125,
"learning_rate": 4.8431208559509456e-05,
"loss": 2.6104,
"step": 181
},
{
"epoch": 0.15909960115828006,
"grad_norm": 1.0625,
"learning_rate": 4.8405871765993433e-05,
"loss": 2.6695,
"step": 182
},
{
"epoch": 0.15997377479101788,
"grad_norm": 1.1484375,
"learning_rate": 4.8380338742756157e-05,
"loss": 2.6339,
"step": 183
},
{
"epoch": 0.16084794842375566,
"grad_norm": 0.9609375,
"learning_rate": 4.835460970386093e-05,
"loss": 2.6176,
"step": 184
},
{
"epoch": 0.16172212205649347,
"grad_norm": 1.1015625,
"learning_rate": 4.8328684865014386e-05,
"loss": 2.6188,
"step": 185
},
{
"epoch": 0.16259629568923128,
"grad_norm": 0.9140625,
"learning_rate": 4.830256444356473e-05,
"loss": 2.5651,
"step": 186
},
{
"epoch": 0.16347046932196907,
"grad_norm": 1.078125,
"learning_rate": 4.827624865849987e-05,
"loss": 2.6513,
"step": 187
},
{
"epoch": 0.16434464295470688,
"grad_norm": 0.953125,
"learning_rate": 4.82497377304456e-05,
"loss": 2.6408,
"step": 188
},
{
"epoch": 0.1652188165874447,
"grad_norm": 1.015625,
"learning_rate": 4.822303188166377e-05,
"loss": 2.6039,
"step": 189
},
{
"epoch": 0.16609299022018248,
"grad_norm": 0.921875,
"learning_rate": 4.819613133605036e-05,
"loss": 2.6749,
"step": 190
},
{
"epoch": 0.1669671638529203,
"grad_norm": 1.171875,
"learning_rate": 4.816903631913372e-05,
"loss": 2.602,
"step": 191
},
{
"epoch": 0.16784133748565808,
"grad_norm": 0.9921875,
"learning_rate": 4.814174705807252e-05,
"loss": 2.5986,
"step": 192
},
{
"epoch": 0.1687155111183959,
"grad_norm": 1.046875,
"learning_rate": 4.811426378165398e-05,
"loss": 2.5921,
"step": 193
},
{
"epoch": 0.1695896847511337,
"grad_norm": 0.984375,
"learning_rate": 4.808658672029189e-05,
"loss": 2.5958,
"step": 194
},
{
"epoch": 0.17046385838387149,
"grad_norm": 1.2265625,
"learning_rate": 4.8058716106024705e-05,
"loss": 2.5892,
"step": 195
},
{
"epoch": 0.1713380320166093,
"grad_norm": 0.99609375,
"learning_rate": 4.803065217251357e-05,
"loss": 2.5633,
"step": 196
},
{
"epoch": 0.1722122056493471,
"grad_norm": 0.98046875,
"learning_rate": 4.800239515504036e-05,
"loss": 2.6577,
"step": 197
},
{
"epoch": 0.1730863792820849,
"grad_norm": 0.94921875,
"learning_rate": 4.7973945290505766e-05,
"loss": 2.6721,
"step": 198
},
{
"epoch": 0.1739605529148227,
"grad_norm": 1.0625,
"learning_rate": 4.794530281742724e-05,
"loss": 2.6837,
"step": 199
},
{
"epoch": 0.17483472654756052,
"grad_norm": 0.9296875,
"learning_rate": 4.791646797593702e-05,
"loss": 2.5801,
"step": 200
},
{
"epoch": 0.1757089001802983,
"grad_norm": 1.0078125,
"learning_rate": 4.7887441007780123e-05,
"loss": 2.5675,
"step": 201
},
{
"epoch": 0.17658307381303612,
"grad_norm": 0.953125,
"learning_rate": 4.7858222156312316e-05,
"loss": 2.6157,
"step": 202
},
{
"epoch": 0.17745724744577393,
"grad_norm": 0.95703125,
"learning_rate": 4.782881166649808e-05,
"loss": 2.6109,
"step": 203
},
{
"epoch": 0.17833142107851171,
"grad_norm": 0.9609375,
"learning_rate": 4.779920978490854e-05,
"loss": 2.5524,
"step": 204
},
{
"epoch": 0.17920559471124953,
"grad_norm": 0.953125,
"learning_rate": 4.776941675971941e-05,
"loss": 2.6292,
"step": 205
},
{
"epoch": 0.1800797683439873,
"grad_norm": 0.94140625,
"learning_rate": 4.773943284070892e-05,
"loss": 2.5868,
"step": 206
},
{
"epoch": 0.18095394197672512,
"grad_norm": 1.0234375,
"learning_rate": 4.7709258279255696e-05,
"loss": 2.5811,
"step": 207
},
{
"epoch": 0.18182811560946294,
"grad_norm": 0.98828125,
"learning_rate": 4.767889332833667e-05,
"loss": 2.6033,
"step": 208
},
{
"epoch": 0.18270228924220072,
"grad_norm": 0.953125,
"learning_rate": 4.764833824252498e-05,
"loss": 2.5816,
"step": 209
},
{
"epoch": 0.18357646287493853,
"grad_norm": 1.03125,
"learning_rate": 4.7617593277987794e-05,
"loss": 2.6657,
"step": 210
},
{
"epoch": 0.18445063650767635,
"grad_norm": 0.90234375,
"learning_rate": 4.758665869248417e-05,
"loss": 2.5748,
"step": 211
},
{
"epoch": 0.18532481014041413,
"grad_norm": 0.9375,
"learning_rate": 4.755553474536294e-05,
"loss": 2.6091,
"step": 212
},
{
"epoch": 0.18619898377315194,
"grad_norm": 0.921875,
"learning_rate": 4.752422169756048e-05,
"loss": 2.5747,
"step": 213
},
{
"epoch": 0.18707315740588976,
"grad_norm": 0.8984375,
"learning_rate": 4.749271981159855e-05,
"loss": 2.6302,
"step": 214
},
{
"epoch": 0.18794733103862754,
"grad_norm": 0.9296875,
"learning_rate": 4.7461029351582076e-05,
"loss": 2.6072,
"step": 215
},
{
"epoch": 0.18882150467136535,
"grad_norm": 1.0078125,
"learning_rate": 4.7429150583196976e-05,
"loss": 2.6458,
"step": 216
},
{
"epoch": 0.18969567830410317,
"grad_norm": 0.94140625,
"learning_rate": 4.739708377370789e-05,
"loss": 2.5746,
"step": 217
},
{
"epoch": 0.19056985193684095,
"grad_norm": 0.9375,
"learning_rate": 4.736482919195593e-05,
"loss": 2.5883,
"step": 218
},
{
"epoch": 0.19144402556957876,
"grad_norm": 0.96484375,
"learning_rate": 4.733238710835648e-05,
"loss": 2.657,
"step": 219
},
{
"epoch": 0.19231819920231655,
"grad_norm": 0.91796875,
"learning_rate": 4.729975779489689e-05,
"loss": 2.6394,
"step": 220
},
{
"epoch": 0.19319237283505436,
"grad_norm": 0.98046875,
"learning_rate": 4.7266941525134215e-05,
"loss": 2.6204,
"step": 221
},
{
"epoch": 0.19406654646779217,
"grad_norm": 0.98828125,
"learning_rate": 4.7233938574192894e-05,
"loss": 2.5254,
"step": 222
},
{
"epoch": 0.19494072010052996,
"grad_norm": 0.96484375,
"learning_rate": 4.720074921876245e-05,
"loss": 2.5567,
"step": 223
},
{
"epoch": 0.19581489373326777,
"grad_norm": 1.125,
"learning_rate": 4.716737373709521e-05,
"loss": 2.6215,
"step": 224
},
{
"epoch": 0.19668906736600558,
"grad_norm": 0.9453125,
"learning_rate": 4.713381240900394e-05,
"loss": 2.5763,
"step": 225
},
{
"epoch": 0.19756324099874337,
"grad_norm": 1.1484375,
"learning_rate": 4.710006551585946e-05,
"loss": 2.6087,
"step": 226
},
{
"epoch": 0.19843741463148118,
"grad_norm": 0.89453125,
"learning_rate": 4.7066133340588394e-05,
"loss": 2.5327,
"step": 227
},
{
"epoch": 0.199311588264219,
"grad_norm": 0.95703125,
"learning_rate": 4.703201616767067e-05,
"loss": 2.5569,
"step": 228
},
{
"epoch": 0.20018576189695678,
"grad_norm": 0.9765625,
"learning_rate": 4.699771428313722e-05,
"loss": 2.5719,
"step": 229
},
{
"epoch": 0.2010599355296946,
"grad_norm": 0.9296875,
"learning_rate": 4.696322797456757e-05,
"loss": 2.5906,
"step": 230
},
{
"epoch": 0.2019341091624324,
"grad_norm": 0.9296875,
"learning_rate": 4.69285575310874e-05,
"loss": 2.5452,
"step": 231
},
{
"epoch": 0.20280828279517019,
"grad_norm": 1.03125,
"learning_rate": 4.689370324336615e-05,
"loss": 2.6078,
"step": 232
},
{
"epoch": 0.203682456427908,
"grad_norm": 0.890625,
"learning_rate": 4.685866540361456e-05,
"loss": 2.561,
"step": 233
},
{
"epoch": 0.20455663006064578,
"grad_norm": 0.91796875,
"learning_rate": 4.682344430558222e-05,
"loss": 2.6126,
"step": 234
},
{
"epoch": 0.2054308036933836,
"grad_norm": 0.9140625,
"learning_rate": 4.6788040244555145e-05,
"loss": 2.6181,
"step": 235
},
{
"epoch": 0.2063049773261214,
"grad_norm": 0.88671875,
"learning_rate": 4.6752453517353245e-05,
"loss": 2.5554,
"step": 236
},
{
"epoch": 0.2071791509588592,
"grad_norm": 0.91796875,
"learning_rate": 4.6716684422327886e-05,
"loss": 2.5949,
"step": 237
},
{
"epoch": 0.208053324591597,
"grad_norm": 0.94140625,
"learning_rate": 4.6680733259359346e-05,
"loss": 2.5931,
"step": 238
},
{
"epoch": 0.20892749822433482,
"grad_norm": 0.8515625,
"learning_rate": 4.6644600329854325e-05,
"loss": 2.5865,
"step": 239
},
{
"epoch": 0.2098016718570726,
"grad_norm": 0.9375,
"learning_rate": 4.6608285936743445e-05,
"loss": 2.5658,
"step": 240
},
{
"epoch": 0.21067584548981041,
"grad_norm": 0.88671875,
"learning_rate": 4.657179038447862e-05,
"loss": 2.5902,
"step": 241
},
{
"epoch": 0.21155001912254823,
"grad_norm": 0.89453125,
"learning_rate": 4.653511397903063e-05,
"loss": 2.5303,
"step": 242
},
{
"epoch": 0.212424192755286,
"grad_norm": 0.890625,
"learning_rate": 4.649825702788643e-05,
"loss": 2.6264,
"step": 243
},
{
"epoch": 0.21329836638802382,
"grad_norm": 0.95703125,
"learning_rate": 4.6461219840046654e-05,
"loss": 2.5831,
"step": 244
},
{
"epoch": 0.21417254002076164,
"grad_norm": 0.90234375,
"learning_rate": 4.642400272602302e-05,
"loss": 2.6215,
"step": 245
},
{
"epoch": 0.21504671365349942,
"grad_norm": 0.921875,
"learning_rate": 4.638660599783567e-05,
"loss": 2.5877,
"step": 246
},
{
"epoch": 0.21592088728623723,
"grad_norm": 0.8984375,
"learning_rate": 4.6349029969010644e-05,
"loss": 2.5607,
"step": 247
},
{
"epoch": 0.21679506091897502,
"grad_norm": 0.90234375,
"learning_rate": 4.631127495457713e-05,
"loss": 2.5615,
"step": 248
},
{
"epoch": 0.21766923455171283,
"grad_norm": 0.9453125,
"learning_rate": 4.6273341271064965e-05,
"loss": 2.6131,
"step": 249
},
{
"epoch": 0.21854340818445064,
"grad_norm": 0.890625,
"learning_rate": 4.6235229236501845e-05,
"loss": 2.6152,
"step": 250
},
{
"epoch": 0.21941758181718843,
"grad_norm": 0.9296875,
"learning_rate": 4.619693917041076e-05,
"loss": 2.5947,
"step": 251
},
{
"epoch": 0.22029175544992624,
"grad_norm": 0.9296875,
"learning_rate": 4.615847139380728e-05,
"loss": 2.6395,
"step": 252
},
{
"epoch": 0.22116592908266405,
"grad_norm": 0.8828125,
"learning_rate": 4.611982622919683e-05,
"loss": 2.5855,
"step": 253
},
{
"epoch": 0.22204010271540184,
"grad_norm": 0.875,
"learning_rate": 4.608100400057206e-05,
"loss": 2.5098,
"step": 254
},
{
"epoch": 0.22291427634813965,
"grad_norm": 0.9296875,
"learning_rate": 4.604200503341004e-05,
"loss": 2.6061,
"step": 255
},
{
"epoch": 0.22378844998087746,
"grad_norm": 0.93359375,
"learning_rate": 4.6002829654669616e-05,
"loss": 2.5075,
"step": 256
},
{
"epoch": 0.22466262361361525,
"grad_norm": 0.89453125,
"learning_rate": 4.596347819278861e-05,
"loss": 2.5869,
"step": 257
},
{
"epoch": 0.22553679724635306,
"grad_norm": 0.92578125,
"learning_rate": 4.5923950977681084e-05,
"loss": 2.586,
"step": 258
},
{
"epoch": 0.22641097087909087,
"grad_norm": 0.9296875,
"learning_rate": 4.58842483407346e-05,
"loss": 2.5124,
"step": 259
},
{
"epoch": 0.22728514451182866,
"grad_norm": 0.88671875,
"learning_rate": 4.584437061480739e-05,
"loss": 2.5364,
"step": 260
},
{
"epoch": 0.22815931814456647,
"grad_norm": 0.93359375,
"learning_rate": 4.58043181342256e-05,
"loss": 2.5939,
"step": 261
},
{
"epoch": 0.22903349177730425,
"grad_norm": 0.96484375,
"learning_rate": 4.5764091234780504e-05,
"loss": 2.5893,
"step": 262
},
{
"epoch": 0.22990766541004207,
"grad_norm": 0.9375,
"learning_rate": 4.572369025372564e-05,
"loss": 2.5496,
"step": 263
},
{
"epoch": 0.23078183904277988,
"grad_norm": 0.95703125,
"learning_rate": 4.568311552977401e-05,
"loss": 2.6138,
"step": 264
},
{
"epoch": 0.23165601267551766,
"grad_norm": 0.90625,
"learning_rate": 4.564236740309525e-05,
"loss": 2.5724,
"step": 265
},
{
"epoch": 0.23253018630825548,
"grad_norm": 0.8984375,
"learning_rate": 4.560144621531278e-05,
"loss": 2.5762,
"step": 266
},
{
"epoch": 0.2334043599409933,
"grad_norm": 1.046875,
"learning_rate": 4.5560352309500886e-05,
"loss": 2.5781,
"step": 267
},
{
"epoch": 0.23427853357373107,
"grad_norm": 0.90625,
"learning_rate": 4.551908603018191e-05,
"loss": 2.606,
"step": 268
},
{
"epoch": 0.2351527072064689,
"grad_norm": 1.0234375,
"learning_rate": 4.547764772332333e-05,
"loss": 2.589,
"step": 269
},
{
"epoch": 0.2360268808392067,
"grad_norm": 0.91015625,
"learning_rate": 4.5436037736334894e-05,
"loss": 2.6229,
"step": 270
},
{
"epoch": 0.23690105447194448,
"grad_norm": 0.91796875,
"learning_rate": 4.539425641806562e-05,
"loss": 2.5875,
"step": 271
},
{
"epoch": 0.2377752281046823,
"grad_norm": 0.875,
"learning_rate": 4.535230411880098e-05,
"loss": 2.6023,
"step": 272
},
{
"epoch": 0.2386494017374201,
"grad_norm": 0.8828125,
"learning_rate": 4.531018119025989e-05,
"loss": 2.5965,
"step": 273
},
{
"epoch": 0.2395235753701579,
"grad_norm": 0.953125,
"learning_rate": 4.5267887985591795e-05,
"loss": 2.5359,
"step": 274
},
{
"epoch": 0.2403977490028957,
"grad_norm": 0.9140625,
"learning_rate": 4.522542485937369e-05,
"loss": 2.5603,
"step": 275
},
{
"epoch": 0.2412719226356335,
"grad_norm": 0.8515625,
"learning_rate": 4.5182792167607155e-05,
"loss": 2.6296,
"step": 276
},
{
"epoch": 0.2421460962683713,
"grad_norm": 0.91015625,
"learning_rate": 4.513999026771539e-05,
"loss": 2.5896,
"step": 277
},
{
"epoch": 0.24302026990110911,
"grad_norm": 0.8671875,
"learning_rate": 4.509701951854017e-05,
"loss": 2.5494,
"step": 278
},
{
"epoch": 0.2438944435338469,
"grad_norm": 1.1484375,
"learning_rate": 4.505388028033888e-05,
"loss": 2.6256,
"step": 279
},
{
"epoch": 0.2447686171665847,
"grad_norm": 0.88671875,
"learning_rate": 4.501057291478149e-05,
"loss": 2.6245,
"step": 280
},
{
"epoch": 0.24564279079932252,
"grad_norm": 0.94140625,
"learning_rate": 4.496709778494749e-05,
"loss": 2.5308,
"step": 281
},
{
"epoch": 0.2465169644320603,
"grad_norm": 0.8828125,
"learning_rate": 4.492345525532288e-05,
"loss": 2.6629,
"step": 282
},
{
"epoch": 0.24739113806479812,
"grad_norm": 0.95703125,
"learning_rate": 4.487964569179711e-05,
"loss": 2.4932,
"step": 283
},
{
"epoch": 0.24826531169753593,
"grad_norm": 0.94921875,
"learning_rate": 4.4835669461660004e-05,
"loss": 2.5798,
"step": 284
},
{
"epoch": 0.24913948533027372,
"grad_norm": 0.9375,
"learning_rate": 4.479152693359868e-05,
"loss": 2.6232,
"step": 285
},
{
"epoch": 0.25001365896301153,
"grad_norm": 0.94921875,
"learning_rate": 4.474721847769445e-05,
"loss": 2.5524,
"step": 286
},
{
"epoch": 0.2508878325957493,
"grad_norm": 0.9375,
"learning_rate": 4.4702744465419744e-05,
"loss": 2.6093,
"step": 287
},
{
"epoch": 0.25176200622848716,
"grad_norm": 1.265625,
"learning_rate": 4.465810526963499e-05,
"loss": 2.5971,
"step": 288
},
{
"epoch": 0.25263617986122494,
"grad_norm": 0.92578125,
"learning_rate": 4.461330126458544e-05,
"loss": 2.529,
"step": 289
},
{
"epoch": 0.2535103534939627,
"grad_norm": 0.90234375,
"learning_rate": 4.4568332825898105e-05,
"loss": 2.5475,
"step": 290
},
{
"epoch": 0.25438452712670057,
"grad_norm": 0.9140625,
"learning_rate": 4.452320033057856e-05,
"loss": 2.5431,
"step": 291
},
{
"epoch": 0.25525870075943835,
"grad_norm": 0.96875,
"learning_rate": 4.447790415700781e-05,
"loss": 2.5771,
"step": 292
},
{
"epoch": 0.25613287439217614,
"grad_norm": 0.890625,
"learning_rate": 4.4432444684939077e-05,
"loss": 2.6166,
"step": 293
},
{
"epoch": 0.257007048024914,
"grad_norm": 0.9765625,
"learning_rate": 4.438682229549466e-05,
"loss": 2.5507,
"step": 294
},
{
"epoch": 0.25788122165765176,
"grad_norm": 0.90625,
"learning_rate": 4.434103737116272e-05,
"loss": 2.5351,
"step": 295
},
{
"epoch": 0.25875539529038954,
"grad_norm": 0.90625,
"learning_rate": 4.429509029579405e-05,
"loss": 2.6678,
"step": 296
},
{
"epoch": 0.25962956892312733,
"grad_norm": 0.9765625,
"learning_rate": 4.4248981454598935e-05,
"loss": 2.5859,
"step": 297
},
{
"epoch": 0.26050374255586517,
"grad_norm": 0.8984375,
"learning_rate": 4.420271123414381e-05,
"loss": 2.5215,
"step": 298
},
{
"epoch": 0.26137791618860295,
"grad_norm": 0.8515625,
"learning_rate": 4.415628002234812e-05,
"loss": 2.5394,
"step": 299
},
{
"epoch": 0.26225208982134074,
"grad_norm": 0.9765625,
"learning_rate": 4.4109688208481015e-05,
"loss": 2.6149,
"step": 300
},
{
"epoch": 0.2631262634540786,
"grad_norm": 0.921875,
"learning_rate": 4.406293618315809e-05,
"loss": 2.5216,
"step": 301
},
{
"epoch": 0.26400043708681636,
"grad_norm": 0.875,
"learning_rate": 4.4016024338338114e-05,
"loss": 2.5536,
"step": 302
},
{
"epoch": 0.26487461071955415,
"grad_norm": 0.88671875,
"learning_rate": 4.3968953067319777e-05,
"loss": 2.5415,
"step": 303
},
{
"epoch": 0.265748784352292,
"grad_norm": 0.87890625,
"learning_rate": 4.3921722764738326e-05,
"loss": 2.5575,
"step": 304
},
{
"epoch": 0.2666229579850298,
"grad_norm": 0.86328125,
"learning_rate": 4.387433382656232e-05,
"loss": 2.4776,
"step": 305
},
{
"epoch": 0.26749713161776756,
"grad_norm": 0.91015625,
"learning_rate": 4.382678665009028e-05,
"loss": 2.5806,
"step": 306
},
{
"epoch": 0.2683713052505054,
"grad_norm": 0.84765625,
"learning_rate": 4.377908163394734e-05,
"loss": 2.5854,
"step": 307
},
{
"epoch": 0.2692454788832432,
"grad_norm": 0.87109375,
"learning_rate": 4.373121917808196e-05,
"loss": 2.5241,
"step": 308
},
{
"epoch": 0.27011965251598097,
"grad_norm": 0.91015625,
"learning_rate": 4.368319968376253e-05,
"loss": 2.4803,
"step": 309
},
{
"epoch": 0.2709938261487188,
"grad_norm": 0.84765625,
"learning_rate": 4.363502355357399e-05,
"loss": 2.5509,
"step": 310
},
{
"epoch": 0.2718679997814566,
"grad_norm": 0.8671875,
"learning_rate": 4.358669119141453e-05,
"loss": 2.5421,
"step": 311
},
{
"epoch": 0.2727421734141944,
"grad_norm": 0.88671875,
"learning_rate": 4.3538203002492104e-05,
"loss": 2.5374,
"step": 312
},
{
"epoch": 0.2736163470469322,
"grad_norm": 0.85546875,
"learning_rate": 4.348955939332111e-05,
"loss": 2.5822,
"step": 313
},
{
"epoch": 0.27449052067967,
"grad_norm": 0.87890625,
"learning_rate": 4.344076077171897e-05,
"loss": 2.5644,
"step": 314
},
{
"epoch": 0.2753646943124078,
"grad_norm": 0.86328125,
"learning_rate": 4.339180754680267e-05,
"loss": 2.6278,
"step": 315
},
{
"epoch": 0.2762388679451456,
"grad_norm": 0.86328125,
"learning_rate": 4.3342700128985345e-05,
"loss": 2.577,
"step": 316
},
{
"epoch": 0.2771130415778834,
"grad_norm": 0.84765625,
"learning_rate": 4.3293438929972894e-05,
"loss": 2.5167,
"step": 317
},
{
"epoch": 0.2779872152106212,
"grad_norm": 0.8515625,
"learning_rate": 4.324402436276046e-05,
"loss": 2.5297,
"step": 318
},
{
"epoch": 0.27886138884335904,
"grad_norm": 0.890625,
"learning_rate": 4.319445684162897e-05,
"loss": 2.58,
"step": 319
},
{
"epoch": 0.2797355624760968,
"grad_norm": 0.921875,
"learning_rate": 4.3144736782141725e-05,
"loss": 2.5503,
"step": 320
},
{
"epoch": 0.2806097361088346,
"grad_norm": 0.8984375,
"learning_rate": 4.309486460114085e-05,
"loss": 2.4978,
"step": 321
},
{
"epoch": 0.28148390974157245,
"grad_norm": 0.9140625,
"learning_rate": 4.3044840716743824e-05,
"loss": 2.5319,
"step": 322
},
{
"epoch": 0.28235808337431023,
"grad_norm": 0.890625,
"learning_rate": 4.299466554833997e-05,
"loss": 2.5353,
"step": 323
},
{
"epoch": 0.283232257007048,
"grad_norm": 0.87890625,
"learning_rate": 4.294433951658697e-05,
"loss": 2.6071,
"step": 324
},
{
"epoch": 0.2841064306397858,
"grad_norm": 0.9375,
"learning_rate": 4.289386304340727e-05,
"loss": 2.6526,
"step": 325
},
{
"epoch": 0.28498060427252364,
"grad_norm": 0.86328125,
"learning_rate": 4.284323655198462e-05,
"loss": 2.553,
"step": 326
},
{
"epoch": 0.2858547779052614,
"grad_norm": 0.9296875,
"learning_rate": 4.2792460466760485e-05,
"loss": 2.5924,
"step": 327
},
{
"epoch": 0.2867289515379992,
"grad_norm": 0.91796875,
"learning_rate": 4.274153521343046e-05,
"loss": 2.5093,
"step": 328
},
{
"epoch": 0.28760312517073705,
"grad_norm": 0.87890625,
"learning_rate": 4.269046121894077e-05,
"loss": 2.5962,
"step": 329
},
{
"epoch": 0.28847729880347484,
"grad_norm": 0.89453125,
"learning_rate": 4.2639238911484633e-05,
"loss": 2.5287,
"step": 330
},
{
"epoch": 0.2893514724362126,
"grad_norm": 0.85546875,
"learning_rate": 4.2587868720498705e-05,
"loss": 2.5151,
"step": 331
},
{
"epoch": 0.29022564606895046,
"grad_norm": 0.890625,
"learning_rate": 4.253635107665945e-05,
"loss": 2.5844,
"step": 332
},
{
"epoch": 0.29109981970168824,
"grad_norm": 0.83203125,
"learning_rate": 4.2484686411879554e-05,
"loss": 2.5545,
"step": 333
},
{
"epoch": 0.29197399333442603,
"grad_norm": 0.84375,
"learning_rate": 4.2432875159304295e-05,
"loss": 2.5029,
"step": 334
},
{
"epoch": 0.29284816696716387,
"grad_norm": 0.875,
"learning_rate": 4.2380917753307904e-05,
"loss": 2.5439,
"step": 335
},
{
"epoch": 0.29372234059990165,
"grad_norm": 0.88671875,
"learning_rate": 4.232881462948994e-05,
"loss": 2.5714,
"step": 336
},
{
"epoch": 0.29459651423263944,
"grad_norm": 0.8671875,
"learning_rate": 4.227656622467162e-05,
"loss": 2.5515,
"step": 337
},
{
"epoch": 0.2954706878653773,
"grad_norm": 0.8828125,
"learning_rate": 4.222417297689217e-05,
"loss": 2.5615,
"step": 338
},
{
"epoch": 0.29634486149811506,
"grad_norm": 0.9296875,
"learning_rate": 4.217163532540514e-05,
"loss": 2.57,
"step": 339
},
{
"epoch": 0.29721903513085285,
"grad_norm": 0.8984375,
"learning_rate": 4.211895371067474e-05,
"loss": 2.5805,
"step": 340
},
{
"epoch": 0.2980932087635907,
"grad_norm": 0.859375,
"learning_rate": 4.206612857437213e-05,
"loss": 2.6419,
"step": 341
},
{
"epoch": 0.2989673823963285,
"grad_norm": 0.8828125,
"learning_rate": 4.2013160359371736e-05,
"loss": 2.5025,
"step": 342
},
{
"epoch": 0.29984155602906626,
"grad_norm": 0.8515625,
"learning_rate": 4.19600495097475e-05,
"loss": 2.4513,
"step": 343
},
{
"epoch": 0.3007157296618041,
"grad_norm": 0.9140625,
"learning_rate": 4.1906796470769195e-05,
"loss": 2.6036,
"step": 344
},
{
"epoch": 0.3015899032945419,
"grad_norm": 0.90234375,
"learning_rate": 4.185340168889868e-05,
"loss": 2.5366,
"step": 345
},
{
"epoch": 0.30246407692727967,
"grad_norm": 0.9453125,
"learning_rate": 4.179986561178617e-05,
"loss": 2.539,
"step": 346
},
{
"epoch": 0.3033382505600175,
"grad_norm": 0.8828125,
"learning_rate": 4.1746188688266444e-05,
"loss": 2.5152,
"step": 347
},
{
"epoch": 0.3042124241927553,
"grad_norm": 0.8828125,
"learning_rate": 4.16923713683551e-05,
"loss": 2.6098,
"step": 348
},
{
"epoch": 0.3050865978254931,
"grad_norm": 0.890625,
"learning_rate": 4.163841410324482e-05,
"loss": 2.5229,
"step": 349
},
{
"epoch": 0.3059607714582309,
"grad_norm": 0.87890625,
"learning_rate": 4.158431734530154e-05,
"loss": 2.5009,
"step": 350
},
{
"epoch": 0.3068349450909687,
"grad_norm": 0.9140625,
"learning_rate": 4.153008154806067e-05,
"loss": 2.4947,
"step": 351
},
{
"epoch": 0.3077091187237065,
"grad_norm": 0.953125,
"learning_rate": 4.1475707166223296e-05,
"loss": 2.5652,
"step": 352
},
{
"epoch": 0.30858329235644427,
"grad_norm": 0.8828125,
"learning_rate": 4.142119465565238e-05,
"loss": 2.5643,
"step": 353
},
{
"epoch": 0.3094574659891821,
"grad_norm": 0.953125,
"learning_rate": 4.13665444733689e-05,
"loss": 2.5575,
"step": 354
},
{
"epoch": 0.3103316396219199,
"grad_norm": 0.8984375,
"learning_rate": 4.131175707754807e-05,
"loss": 2.4748,
"step": 355
},
{
"epoch": 0.3112058132546577,
"grad_norm": 0.96875,
"learning_rate": 4.125683292751546e-05,
"loss": 2.53,
"step": 356
},
{
"epoch": 0.3120799868873955,
"grad_norm": 0.875,
"learning_rate": 4.120177248374315e-05,
"loss": 2.5582,
"step": 357
},
{
"epoch": 0.3129541605201333,
"grad_norm": 0.90625,
"learning_rate": 4.114657620784589e-05,
"loss": 2.5842,
"step": 358
},
{
"epoch": 0.3138283341528711,
"grad_norm": 0.94140625,
"learning_rate": 4.109124456257721e-05,
"loss": 2.5279,
"step": 359
},
{
"epoch": 0.31470250778560893,
"grad_norm": 0.90234375,
"learning_rate": 4.103577801182557e-05,
"loss": 2.5657,
"step": 360
},
{
"epoch": 0.3155766814183467,
"grad_norm": 1.125,
"learning_rate": 4.098017702061039e-05,
"loss": 2.5622,
"step": 361
},
{
"epoch": 0.3164508550510845,
"grad_norm": 0.88671875,
"learning_rate": 4.0924442055078276e-05,
"loss": 2.5328,
"step": 362
},
{
"epoch": 0.31732502868382234,
"grad_norm": 0.94140625,
"learning_rate": 4.0868573582499004e-05,
"loss": 2.5514,
"step": 363
},
{
"epoch": 0.3181992023165601,
"grad_norm": 0.9453125,
"learning_rate": 4.0812572071261654e-05,
"loss": 2.5575,
"step": 364
},
{
"epoch": 0.3190733759492979,
"grad_norm": 0.89453125,
"learning_rate": 4.07564379908707e-05,
"loss": 2.5688,
"step": 365
},
{
"epoch": 0.31994754958203575,
"grad_norm": 0.9140625,
"learning_rate": 4.070017181194199e-05,
"loss": 2.5032,
"step": 366
},
{
"epoch": 0.32082172321477354,
"grad_norm": 0.86328125,
"learning_rate": 4.0643774006198907e-05,
"loss": 2.5319,
"step": 367
},
{
"epoch": 0.3216958968475113,
"grad_norm": 0.94140625,
"learning_rate": 4.058724504646834e-05,
"loss": 2.5558,
"step": 368
},
{
"epoch": 0.32257007048024916,
"grad_norm": 0.84375,
"learning_rate": 4.053058540667676e-05,
"loss": 2.5876,
"step": 369
},
{
"epoch": 0.32344424411298694,
"grad_norm": 0.875,
"learning_rate": 4.0473795561846215e-05,
"loss": 2.5354,
"step": 370
},
{
"epoch": 0.32431841774572473,
"grad_norm": 0.84375,
"learning_rate": 4.0416875988090375e-05,
"loss": 2.531,
"step": 371
},
{
"epoch": 0.32519259137846257,
"grad_norm": 0.859375,
"learning_rate": 4.035982716261053e-05,
"loss": 2.5584,
"step": 372
},
{
"epoch": 0.32606676501120035,
"grad_norm": 0.84765625,
"learning_rate": 4.030264956369157e-05,
"loss": 2.4785,
"step": 373
},
{
"epoch": 0.32694093864393814,
"grad_norm": 0.90234375,
"learning_rate": 4.0245343670698025e-05,
"loss": 2.549,
"step": 374
},
{
"epoch": 0.327815112276676,
"grad_norm": 0.83984375,
"learning_rate": 4.018790996406998e-05,
"loss": 2.4917,
"step": 375
},
{
"epoch": 0.32868928590941376,
"grad_norm": 0.8671875,
"learning_rate": 4.01303489253191e-05,
"loss": 2.4882,
"step": 376
},
{
"epoch": 0.32956345954215155,
"grad_norm": 0.8828125,
"learning_rate": 4.0072661037024596e-05,
"loss": 2.5832,
"step": 377
},
{
"epoch": 0.3304376331748894,
"grad_norm": 0.87109375,
"learning_rate": 4.0014846782829104e-05,
"loss": 2.5667,
"step": 378
},
{
"epoch": 0.3313118068076272,
"grad_norm": 0.8515625,
"learning_rate": 3.9956906647434736e-05,
"loss": 2.511,
"step": 379
},
{
"epoch": 0.33218598044036496,
"grad_norm": 0.86328125,
"learning_rate": 3.989884111659893e-05,
"loss": 2.5146,
"step": 380
},
{
"epoch": 0.33306015407310274,
"grad_norm": 0.8671875,
"learning_rate": 3.984065067713043e-05,
"loss": 2.4662,
"step": 381
},
{
"epoch": 0.3339343277058406,
"grad_norm": 0.8671875,
"learning_rate": 3.978233581688518e-05,
"loss": 2.5807,
"step": 382
},
{
"epoch": 0.33480850133857837,
"grad_norm": 0.8515625,
"learning_rate": 3.9723897024762255e-05,
"loss": 2.5095,
"step": 383
},
{
"epoch": 0.33568267497131615,
"grad_norm": 0.85546875,
"learning_rate": 3.9665334790699714e-05,
"loss": 2.5084,
"step": 384
},
{
"epoch": 0.336556848604054,
"grad_norm": 0.83984375,
"learning_rate": 3.960664960567057e-05,
"loss": 2.5447,
"step": 385
},
{
"epoch": 0.3374310222367918,
"grad_norm": 0.875,
"learning_rate": 3.95478419616786e-05,
"loss": 2.5544,
"step": 386
},
{
"epoch": 0.33830519586952956,
"grad_norm": 0.8203125,
"learning_rate": 3.948891235175425e-05,
"loss": 2.5338,
"step": 387
},
{
"epoch": 0.3391793695022674,
"grad_norm": 0.84375,
"learning_rate": 3.942986126995052e-05,
"loss": 2.5239,
"step": 388
},
{
"epoch": 0.3400535431350052,
"grad_norm": 0.86328125,
"learning_rate": 3.937068921133879e-05,
"loss": 2.5493,
"step": 389
},
{
"epoch": 0.34092771676774297,
"grad_norm": 0.80859375,
"learning_rate": 3.931139667200469e-05,
"loss": 2.4874,
"step": 390
},
{
"epoch": 0.3418018904004808,
"grad_norm": 0.83984375,
"learning_rate": 3.9251984149043917e-05,
"loss": 2.5066,
"step": 391
},
{
"epoch": 0.3426760640332186,
"grad_norm": 0.8203125,
"learning_rate": 3.919245214055812e-05,
"loss": 2.5081,
"step": 392
},
{
"epoch": 0.3435502376659564,
"grad_norm": 0.84375,
"learning_rate": 3.913280114565066e-05,
"loss": 2.5536,
"step": 393
},
{
"epoch": 0.3444244112986942,
"grad_norm": 0.8828125,
"learning_rate": 3.9073031664422444e-05,
"loss": 2.5335,
"step": 394
},
{
"epoch": 0.345298584931432,
"grad_norm": 0.83203125,
"learning_rate": 3.901314419796778e-05,
"loss": 2.4885,
"step": 395
},
{
"epoch": 0.3461727585641698,
"grad_norm": 0.890625,
"learning_rate": 3.8953139248370116e-05,
"loss": 2.5373,
"step": 396
},
{
"epoch": 0.34704693219690763,
"grad_norm": 0.8515625,
"learning_rate": 3.889301731869784e-05,
"loss": 2.563,
"step": 397
},
{
"epoch": 0.3479211058296454,
"grad_norm": 0.8515625,
"learning_rate": 3.883277891300011e-05,
"loss": 2.5089,
"step": 398
},
{
"epoch": 0.3487952794623832,
"grad_norm": 0.90234375,
"learning_rate": 3.8772424536302564e-05,
"loss": 2.5444,
"step": 399
},
{
"epoch": 0.34966945309512104,
"grad_norm": 0.8359375,
"learning_rate": 3.8711954694603126e-05,
"loss": 2.4677,
"step": 400
},
{
"epoch": 0.3505436267278588,
"grad_norm": 0.87890625,
"learning_rate": 3.865136989486776e-05,
"loss": 2.4907,
"step": 401
},
{
"epoch": 0.3514178003605966,
"grad_norm": 0.8671875,
"learning_rate": 3.8590670645026195e-05,
"loss": 2.4889,
"step": 402
},
{
"epoch": 0.35229197399333445,
"grad_norm": 0.9140625,
"learning_rate": 3.85298574539677e-05,
"loss": 2.5175,
"step": 403
},
{
"epoch": 0.35316614762607224,
"grad_norm": 0.88671875,
"learning_rate": 3.84689308315368e-05,
"loss": 2.555,
"step": 404
},
{
"epoch": 0.35404032125881,
"grad_norm": 0.96875,
"learning_rate": 3.8407891288529004e-05,
"loss": 2.4927,
"step": 405
},
{
"epoch": 0.35491449489154786,
"grad_norm": 0.81640625,
"learning_rate": 3.834673933668651e-05,
"loss": 2.4928,
"step": 406
},
{
"epoch": 0.35578866852428565,
"grad_norm": 0.86328125,
"learning_rate": 3.828547548869396e-05,
"loss": 2.5426,
"step": 407
},
{
"epoch": 0.35666284215702343,
"grad_norm": 0.8828125,
"learning_rate": 3.822410025817406e-05,
"loss": 2.5477,
"step": 408
},
{
"epoch": 0.3575370157897612,
"grad_norm": 0.890625,
"learning_rate": 3.8162614159683374e-05,
"loss": 2.5466,
"step": 409
},
{
"epoch": 0.35841118942249905,
"grad_norm": 0.8671875,
"learning_rate": 3.8101017708707906e-05,
"loss": 2.5304,
"step": 410
},
{
"epoch": 0.35928536305523684,
"grad_norm": 0.91015625,
"learning_rate": 3.8039311421658887e-05,
"loss": 2.556,
"step": 411
},
{
"epoch": 0.3601595366879746,
"grad_norm": 0.8984375,
"learning_rate": 3.797749581586835e-05,
"loss": 2.5913,
"step": 412
},
{
"epoch": 0.36103371032071246,
"grad_norm": 0.87109375,
"learning_rate": 3.7915571409584836e-05,
"loss": 2.5172,
"step": 413
},
{
"epoch": 0.36190788395345025,
"grad_norm": 0.88671875,
"learning_rate": 3.7853538721969064e-05,
"loss": 2.4756,
"step": 414
},
{
"epoch": 0.36278205758618803,
"grad_norm": 0.8359375,
"learning_rate": 3.779139827308956e-05,
"loss": 2.5278,
"step": 415
},
{
"epoch": 0.3636562312189259,
"grad_norm": 0.8671875,
"learning_rate": 3.7729150583918264e-05,
"loss": 2.4925,
"step": 416
},
{
"epoch": 0.36453040485166366,
"grad_norm": 0.90625,
"learning_rate": 3.766679617632624e-05,
"loss": 2.5038,
"step": 417
},
{
"epoch": 0.36540457848440144,
"grad_norm": 0.8203125,
"learning_rate": 3.760433557307922e-05,
"loss": 2.518,
"step": 418
},
{
"epoch": 0.3662787521171393,
"grad_norm": 0.83984375,
"learning_rate": 3.754176929783327e-05,
"loss": 2.554,
"step": 419
},
{
"epoch": 0.36715292574987707,
"grad_norm": 0.859375,
"learning_rate": 3.74790978751304e-05,
"loss": 2.5062,
"step": 420
},
{
"epoch": 0.36802709938261485,
"grad_norm": 0.8828125,
"learning_rate": 3.7416321830394144e-05,
"loss": 2.5755,
"step": 421
},
{
"epoch": 0.3689012730153527,
"grad_norm": 0.828125,
"learning_rate": 3.735344168992515e-05,
"loss": 2.5203,
"step": 422
},
{
"epoch": 0.3697754466480905,
"grad_norm": 0.859375,
"learning_rate": 3.7290457980896795e-05,
"loss": 2.4996,
"step": 423
},
{
"epoch": 0.37064962028082826,
"grad_norm": 0.859375,
"learning_rate": 3.722737123135075e-05,
"loss": 2.5625,
"step": 424
},
{
"epoch": 0.3715237939135661,
"grad_norm": 0.8359375,
"learning_rate": 3.716418197019257e-05,
"loss": 2.5665,
"step": 425
},
{
"epoch": 0.3723979675463039,
"grad_norm": 0.8515625,
"learning_rate": 3.710089072718722e-05,
"loss": 2.5188,
"step": 426
},
{
"epoch": 0.37327214117904167,
"grad_norm": 0.84375,
"learning_rate": 3.7037498032954664e-05,
"loss": 2.5166,
"step": 427
},
{
"epoch": 0.3741463148117795,
"grad_norm": 0.87890625,
"learning_rate": 3.697400441896543e-05,
"loss": 2.5166,
"step": 428
},
{
"epoch": 0.3750204884445173,
"grad_norm": 0.8515625,
"learning_rate": 3.691041041753613e-05,
"loss": 2.5436,
"step": 429
},
{
"epoch": 0.3758946620772551,
"grad_norm": 0.83203125,
"learning_rate": 3.6846716561824965e-05,
"loss": 2.5019,
"step": 430
},
{
"epoch": 0.3767688357099929,
"grad_norm": 0.8515625,
"learning_rate": 3.678292338582735e-05,
"loss": 2.5575,
"step": 431
},
{
"epoch": 0.3776430093427307,
"grad_norm": 0.8046875,
"learning_rate": 3.671903142437134e-05,
"loss": 2.5161,
"step": 432
},
{
"epoch": 0.3785171829754685,
"grad_norm": 0.84765625,
"learning_rate": 3.6655041213113184e-05,
"loss": 2.5285,
"step": 433
},
{
"epoch": 0.37939135660820633,
"grad_norm": 0.8671875,
"learning_rate": 3.659095328853288e-05,
"loss": 2.4936,
"step": 434
},
{
"epoch": 0.3802655302409441,
"grad_norm": 0.85546875,
"learning_rate": 3.652676818792958e-05,
"loss": 2.5238,
"step": 435
},
{
"epoch": 0.3811397038736819,
"grad_norm": 0.8515625,
"learning_rate": 3.646248644941716e-05,
"loss": 2.4821,
"step": 436
},
{
"epoch": 0.3820138775064197,
"grad_norm": 0.87109375,
"learning_rate": 3.6398108611919696e-05,
"loss": 2.5309,
"step": 437
},
{
"epoch": 0.3828880511391575,
"grad_norm": 0.83203125,
"learning_rate": 3.633363521516693e-05,
"loss": 2.508,
"step": 438
},
{
"epoch": 0.3837622247718953,
"grad_norm": 0.84765625,
"learning_rate": 3.626906679968974e-05,
"loss": 2.5292,
"step": 439
},
{
"epoch": 0.3846363984046331,
"grad_norm": 0.83203125,
"learning_rate": 3.6204403906815655e-05,
"loss": 2.5175,
"step": 440
},
{
"epoch": 0.38551057203737094,
"grad_norm": 0.890625,
"learning_rate": 3.613964707866424e-05,
"loss": 2.5478,
"step": 441
},
{
"epoch": 0.3863847456701087,
"grad_norm": 0.8359375,
"learning_rate": 3.607479685814261e-05,
"loss": 2.5442,
"step": 442
},
{
"epoch": 0.3872589193028465,
"grad_norm": 0.90234375,
"learning_rate": 3.600985378894086e-05,
"loss": 2.5198,
"step": 443
},
{
"epoch": 0.38813309293558435,
"grad_norm": 0.94921875,
"learning_rate": 3.594481841552753e-05,
"loss": 2.5001,
"step": 444
},
{
"epoch": 0.38900726656832213,
"grad_norm": 0.8671875,
"learning_rate": 3.5879691283144964e-05,
"loss": 2.53,
"step": 445
},
{
"epoch": 0.3898814402010599,
"grad_norm": 0.859375,
"learning_rate": 3.5814472937804865e-05,
"loss": 2.5589,
"step": 446
},
{
"epoch": 0.39075561383379775,
"grad_norm": 0.86328125,
"learning_rate": 3.574916392628359e-05,
"loss": 2.5402,
"step": 447
},
{
"epoch": 0.39162978746653554,
"grad_norm": 0.85546875,
"learning_rate": 3.5683764796117634e-05,
"loss": 2.48,
"step": 448
},
{
"epoch": 0.3925039610992733,
"grad_norm": 0.89453125,
"learning_rate": 3.561827609559905e-05,
"loss": 2.5504,
"step": 449
},
{
"epoch": 0.39337813473201116,
"grad_norm": 0.8359375,
"learning_rate": 3.55526983737708e-05,
"loss": 2.5011,
"step": 450
},
{
"epoch": 0.39425230836474895,
"grad_norm": 0.8984375,
"learning_rate": 3.54870321804222e-05,
"loss": 2.4815,
"step": 451
},
{
"epoch": 0.39512648199748673,
"grad_norm": 0.85546875,
"learning_rate": 3.5421278066084276e-05,
"loss": 2.537,
"step": 452
},
{
"epoch": 0.3960006556302246,
"grad_norm": 0.875,
"learning_rate": 3.535543658202518e-05,
"loss": 2.5111,
"step": 453
},
{
"epoch": 0.39687482926296236,
"grad_norm": 0.84375,
"learning_rate": 3.528950828024555e-05,
"loss": 2.4883,
"step": 454
},
{
"epoch": 0.39774900289570014,
"grad_norm": 0.828125,
"learning_rate": 3.522349371347387e-05,
"loss": 2.4712,
"step": 455
},
{
"epoch": 0.398623176528438,
"grad_norm": 0.84375,
"learning_rate": 3.515739343516188e-05,
"loss": 2.4872,
"step": 456
},
{
"epoch": 0.39949735016117577,
"grad_norm": 0.8203125,
"learning_rate": 3.509120799947987e-05,
"loss": 2.5711,
"step": 457
},
{
"epoch": 0.40037152379391355,
"grad_norm": 0.828125,
"learning_rate": 3.50249379613121e-05,
"loss": 2.5285,
"step": 458
},
{
"epoch": 0.4012456974266514,
"grad_norm": 0.84765625,
"learning_rate": 3.49585838762521e-05,
"loss": 2.5139,
"step": 459
},
{
"epoch": 0.4021198710593892,
"grad_norm": 0.8046875,
"learning_rate": 3.489214630059806e-05,
"loss": 2.5236,
"step": 460
},
{
"epoch": 0.40299404469212696,
"grad_norm": 0.81640625,
"learning_rate": 3.4825625791348096e-05,
"loss": 2.5336,
"step": 461
},
{
"epoch": 0.4038682183248648,
"grad_norm": 0.8515625,
"learning_rate": 3.475902290619565e-05,
"loss": 2.4917,
"step": 462
},
{
"epoch": 0.4047423919576026,
"grad_norm": 0.8359375,
"learning_rate": 3.469233820352477e-05,
"loss": 2.5423,
"step": 463
},
{
"epoch": 0.40561656559034037,
"grad_norm": 0.85546875,
"learning_rate": 3.462557224240545e-05,
"loss": 2.4924,
"step": 464
},
{
"epoch": 0.40649073922307816,
"grad_norm": 0.82421875,
"learning_rate": 3.455872558258895e-05,
"loss": 2.5107,
"step": 465
},
{
"epoch": 0.407364912855816,
"grad_norm": 0.84375,
"learning_rate": 3.449179878450308e-05,
"loss": 2.5197,
"step": 466
},
{
"epoch": 0.4082390864885538,
"grad_norm": 0.8203125,
"learning_rate": 3.442479240924749e-05,
"loss": 2.4901,
"step": 467
},
{
"epoch": 0.40911326012129157,
"grad_norm": 0.83203125,
"learning_rate": 3.4357707018589036e-05,
"loss": 2.4912,
"step": 468
},
{
"epoch": 0.4099874337540294,
"grad_norm": 0.83984375,
"learning_rate": 3.429054317495697e-05,
"loss": 2.4534,
"step": 469
},
{
"epoch": 0.4108616073867672,
"grad_norm": 0.85546875,
"learning_rate": 3.4223301441438306e-05,
"loss": 2.4801,
"step": 470
},
{
"epoch": 0.411735781019505,
"grad_norm": 0.85546875,
"learning_rate": 3.415598238177307e-05,
"loss": 2.4984,
"step": 471
},
{
"epoch": 0.4126099546522428,
"grad_norm": 0.8203125,
"learning_rate": 3.408858656034957e-05,
"loss": 2.5402,
"step": 472
},
{
"epoch": 0.4134841282849806,
"grad_norm": 0.859375,
"learning_rate": 3.4021114542199664e-05,
"loss": 2.5232,
"step": 473
},
{
"epoch": 0.4143583019177184,
"grad_norm": 0.83203125,
"learning_rate": 3.395356689299401e-05,
"loss": 2.5168,
"step": 474
},
{
"epoch": 0.4152324755504562,
"grad_norm": 0.86328125,
"learning_rate": 3.3885944179037395e-05,
"loss": 2.5563,
"step": 475
},
{
"epoch": 0.416106649183194,
"grad_norm": 0.8203125,
"learning_rate": 3.381824696726386e-05,
"loss": 2.5104,
"step": 476
},
{
"epoch": 0.4169808228159318,
"grad_norm": 0.81640625,
"learning_rate": 3.3750475825232074e-05,
"loss": 2.5002,
"step": 477
},
{
"epoch": 0.41785499644866964,
"grad_norm": 0.828125,
"learning_rate": 3.3682631321120504e-05,
"loss": 2.5262,
"step": 478
},
{
"epoch": 0.4187291700814074,
"grad_norm": 0.859375,
"learning_rate": 3.361471402372267e-05,
"loss": 2.5159,
"step": 479
},
{
"epoch": 0.4196033437141452,
"grad_norm": 0.84375,
"learning_rate": 3.3546724502442354e-05,
"loss": 2.455,
"step": 480
},
{
"epoch": 0.42047751734688305,
"grad_norm": 0.796875,
"learning_rate": 3.347866332728889e-05,
"loss": 2.4299,
"step": 481
},
{
"epoch": 0.42135169097962083,
"grad_norm": 0.90625,
"learning_rate": 3.341053106887229e-05,
"loss": 2.5159,
"step": 482
},
{
"epoch": 0.4222258646123586,
"grad_norm": 0.84765625,
"learning_rate": 3.3342328298398565e-05,
"loss": 2.4763,
"step": 483
},
{
"epoch": 0.42310003824509645,
"grad_norm": 0.83984375,
"learning_rate": 3.3274055587664856e-05,
"loss": 2.4768,
"step": 484
},
{
"epoch": 0.42397421187783424,
"grad_norm": 0.84765625,
"learning_rate": 3.320571350905466e-05,
"loss": 2.5295,
"step": 485
},
{
"epoch": 0.424848385510572,
"grad_norm": 0.83984375,
"learning_rate": 3.313730263553306e-05,
"loss": 2.4913,
"step": 486
},
{
"epoch": 0.42572255914330986,
"grad_norm": 0.8515625,
"learning_rate": 3.3068823540641886e-05,
"loss": 2.5096,
"step": 487
},
{
"epoch": 0.42659673277604765,
"grad_norm": 0.8359375,
"learning_rate": 3.300027679849492e-05,
"loss": 2.5255,
"step": 488
},
{
"epoch": 0.42747090640878543,
"grad_norm": 0.84375,
"learning_rate": 3.2931662983773106e-05,
"loss": 2.4564,
"step": 489
},
{
"epoch": 0.4283450800415233,
"grad_norm": 0.85546875,
"learning_rate": 3.286298267171969e-05,
"loss": 2.5294,
"step": 490
},
{
"epoch": 0.42921925367426106,
"grad_norm": 0.84375,
"learning_rate": 3.2794236438135405e-05,
"loss": 2.5117,
"step": 491
},
{
"epoch": 0.43009342730699884,
"grad_norm": 0.9375,
"learning_rate": 3.272542485937369e-05,
"loss": 2.4564,
"step": 492
},
{
"epoch": 0.43096760093973663,
"grad_norm": 0.828125,
"learning_rate": 3.265654851233579e-05,
"loss": 2.4361,
"step": 493
},
{
"epoch": 0.43184177457247447,
"grad_norm": 0.8125,
"learning_rate": 3.258760797446598e-05,
"loss": 2.5215,
"step": 494
},
{
"epoch": 0.43271594820521225,
"grad_norm": 0.8125,
"learning_rate": 3.251860382374668e-05,
"loss": 2.4979,
"step": 495
},
{
"epoch": 0.43359012183795004,
"grad_norm": 0.8203125,
"learning_rate": 3.244953663869365e-05,
"loss": 2.5005,
"step": 496
},
{
"epoch": 0.4344642954706879,
"grad_norm": 0.84375,
"learning_rate": 3.238040699835106e-05,
"loss": 2.5365,
"step": 497
},
{
"epoch": 0.43533846910342566,
"grad_norm": 0.82421875,
"learning_rate": 3.231121548228676e-05,
"loss": 2.5102,
"step": 498
},
{
"epoch": 0.43621264273616345,
"grad_norm": 0.82421875,
"learning_rate": 3.2241962670587314e-05,
"loss": 2.4999,
"step": 499
},
{
"epoch": 0.4370868163689013,
"grad_norm": 0.83984375,
"learning_rate": 3.2172649143853176e-05,
"loss": 2.4631,
"step": 500
},
{
"epoch": 0.43796099000163907,
"grad_norm": 0.8125,
"learning_rate": 3.210327548319382e-05,
"loss": 2.5414,
"step": 501
},
{
"epoch": 0.43883516363437686,
"grad_norm": 0.86328125,
"learning_rate": 3.203384227022291e-05,
"loss": 2.4368,
"step": 502
},
{
"epoch": 0.4397093372671147,
"grad_norm": 0.83203125,
"learning_rate": 3.196435008705332e-05,
"loss": 2.5089,
"step": 503
},
{
"epoch": 0.4405835108998525,
"grad_norm": 0.79296875,
"learning_rate": 3.1894799516292374e-05,
"loss": 2.4273,
"step": 504
},
{
"epoch": 0.44145768453259027,
"grad_norm": 0.87109375,
"learning_rate": 3.1825191141036864e-05,
"loss": 2.4994,
"step": 505
},
{
"epoch": 0.4423318581653281,
"grad_norm": 0.82421875,
"learning_rate": 3.175552554486822e-05,
"loss": 2.4675,
"step": 506
},
{
"epoch": 0.4432060317980659,
"grad_norm": 0.81640625,
"learning_rate": 3.1685803311847596e-05,
"loss": 2.4315,
"step": 507
},
{
"epoch": 0.4440802054308037,
"grad_norm": 0.83203125,
"learning_rate": 3.161602502651099e-05,
"loss": 2.5206,
"step": 508
},
{
"epoch": 0.4449543790635415,
"grad_norm": 0.85546875,
"learning_rate": 3.1546191273864314e-05,
"loss": 2.4594,
"step": 509
},
{
"epoch": 0.4458285526962793,
"grad_norm": 0.80078125,
"learning_rate": 3.14763026393785e-05,
"loss": 2.4955,
"step": 510
},
{
"epoch": 0.4467027263290171,
"grad_norm": 0.80859375,
"learning_rate": 3.140635970898462e-05,
"loss": 2.4864,
"step": 511
},
{
"epoch": 0.4475768999617549,
"grad_norm": 0.8515625,
"learning_rate": 3.133636306906895e-05,
"loss": 2.4598,
"step": 512
},
{
"epoch": 0.4484510735944927,
"grad_norm": 0.80859375,
"learning_rate": 3.126631330646802e-05,
"loss": 2.5357,
"step": 513
},
{
"epoch": 0.4493252472272305,
"grad_norm": 0.8203125,
"learning_rate": 3.1196211008463765e-05,
"loss": 2.499,
"step": 514
},
{
"epoch": 0.45019942085996834,
"grad_norm": 0.8203125,
"learning_rate": 3.112605676277855e-05,
"loss": 2.5166,
"step": 515
},
{
"epoch": 0.4510735944927061,
"grad_norm": 0.8203125,
"learning_rate": 3.105585115757027e-05,
"loss": 2.4977,
"step": 516
},
{
"epoch": 0.4519477681254439,
"grad_norm": 0.80078125,
"learning_rate": 3.098559478142739e-05,
"loss": 2.48,
"step": 517
},
{
"epoch": 0.45282194175818175,
"grad_norm": 0.83984375,
"learning_rate": 3.091528822336405e-05,
"loss": 2.5027,
"step": 518
},
{
"epoch": 0.45369611539091953,
"grad_norm": 0.8125,
"learning_rate": 3.084493207281507e-05,
"loss": 2.4363,
"step": 519
},
{
"epoch": 0.4545702890236573,
"grad_norm": 0.8203125,
"learning_rate": 3.077452691963108e-05,
"loss": 2.4709,
"step": 520
},
{
"epoch": 0.4554444626563951,
"grad_norm": 0.7890625,
"learning_rate": 3.0704073354073524e-05,
"loss": 2.4589,
"step": 521
},
{
"epoch": 0.45631863628913294,
"grad_norm": 0.84765625,
"learning_rate": 3.063357196680969e-05,
"loss": 2.5196,
"step": 522
},
{
"epoch": 0.4571928099218707,
"grad_norm": 0.796875,
"learning_rate": 3.056302334890786e-05,
"loss": 2.4767,
"step": 523
},
{
"epoch": 0.4580669835546085,
"grad_norm": 0.796875,
"learning_rate": 3.0492428091832235e-05,
"loss": 2.5096,
"step": 524
},
{
"epoch": 0.45894115718734635,
"grad_norm": 0.80078125,
"learning_rate": 3.0421786787438046e-05,
"loss": 2.472,
"step": 525
},
{
"epoch": 0.45981533082008413,
"grad_norm": 0.8203125,
"learning_rate": 3.0351100027966576e-05,
"loss": 2.4269,
"step": 526
},
{
"epoch": 0.4606895044528219,
"grad_norm": 0.78515625,
"learning_rate": 3.028036840604019e-05,
"loss": 2.4802,
"step": 527
},
{
"epoch": 0.46156367808555976,
"grad_norm": 0.796875,
"learning_rate": 3.0209592514657365e-05,
"loss": 2.4102,
"step": 528
},
{
"epoch": 0.46243785171829754,
"grad_norm": 0.875,
"learning_rate": 3.0138772947187743e-05,
"loss": 2.5099,
"step": 529
},
{
"epoch": 0.46331202535103533,
"grad_norm": 0.8203125,
"learning_rate": 3.006791029736711e-05,
"loss": 2.5259,
"step": 530
},
{
"epoch": 0.46418619898377317,
"grad_norm": 0.80859375,
"learning_rate": 2.999700515929247e-05,
"loss": 2.3805,
"step": 531
},
{
"epoch": 0.46506037261651095,
"grad_norm": 0.80859375,
"learning_rate": 2.9926058127417018e-05,
"loss": 2.4986,
"step": 532
},
{
"epoch": 0.46593454624924874,
"grad_norm": 0.8515625,
"learning_rate": 2.9855069796545186e-05,
"loss": 2.5136,
"step": 533
},
{
"epoch": 0.4668087198819866,
"grad_norm": 0.84375,
"learning_rate": 2.9784040761827658e-05,
"loss": 2.4745,
"step": 534
},
{
"epoch": 0.46768289351472436,
"grad_norm": 0.8125,
"learning_rate": 2.9712971618756364e-05,
"loss": 2.4878,
"step": 535
},
{
"epoch": 0.46855706714746215,
"grad_norm": 0.8125,
"learning_rate": 2.9641862963159478e-05,
"loss": 2.4917,
"step": 536
},
{
"epoch": 0.4694312407802,
"grad_norm": 0.828125,
"learning_rate": 2.9570715391196463e-05,
"loss": 2.4364,
"step": 537
},
{
"epoch": 0.4703054144129378,
"grad_norm": 0.80078125,
"learning_rate": 2.9499529499353024e-05,
"loss": 2.4615,
"step": 538
},
{
"epoch": 0.47117958804567556,
"grad_norm": 0.8515625,
"learning_rate": 2.942830588443615e-05,
"loss": 2.526,
"step": 539
},
{
"epoch": 0.4720537616784134,
"grad_norm": 0.82421875,
"learning_rate": 2.935704514356909e-05,
"loss": 2.5232,
"step": 540
},
{
"epoch": 0.4729279353111512,
"grad_norm": 0.828125,
"learning_rate": 2.9285747874186342e-05,
"loss": 2.47,
"step": 541
},
{
"epoch": 0.47380210894388897,
"grad_norm": 0.80859375,
"learning_rate": 2.9214414674028658e-05,
"loss": 2.5342,
"step": 542
},
{
"epoch": 0.4746762825766268,
"grad_norm": 0.8203125,
"learning_rate": 2.9143046141138015e-05,
"loss": 2.5103,
"step": 543
},
{
"epoch": 0.4755504562093646,
"grad_norm": 0.8203125,
"learning_rate": 2.9071642873852612e-05,
"loss": 2.4559,
"step": 544
},
{
"epoch": 0.4764246298421024,
"grad_norm": 0.8359375,
"learning_rate": 2.900020547080188e-05,
"loss": 2.5457,
"step": 545
},
{
"epoch": 0.4772988034748402,
"grad_norm": 0.85546875,
"learning_rate": 2.8928734530901403e-05,
"loss": 2.5192,
"step": 546
},
{
"epoch": 0.478172977107578,
"grad_norm": 0.796875,
"learning_rate": 2.8857230653347945e-05,
"loss": 2.414,
"step": 547
},
{
"epoch": 0.4790471507403158,
"grad_norm": 0.8125,
"learning_rate": 2.878569443761442e-05,
"loss": 2.5149,
"step": 548
},
{
"epoch": 0.47992132437305357,
"grad_norm": 0.7890625,
"learning_rate": 2.871412648344485e-05,
"loss": 2.412,
"step": 549
},
{
"epoch": 0.4807954980057914,
"grad_norm": 0.81640625,
"learning_rate": 2.8642527390849326e-05,
"loss": 2.4455,
"step": 550
},
{
"epoch": 0.4816696716385292,
"grad_norm": 0.91796875,
"learning_rate": 2.8570897760099042e-05,
"loss": 2.4805,
"step": 551
},
{
"epoch": 0.482543845271267,
"grad_norm": 0.80078125,
"learning_rate": 2.849923819172117e-05,
"loss": 2.4148,
"step": 552
},
{
"epoch": 0.4834180189040048,
"grad_norm": 0.8203125,
"learning_rate": 2.8427549286493904e-05,
"loss": 2.4873,
"step": 553
},
{
"epoch": 0.4842921925367426,
"grad_norm": 0.8671875,
"learning_rate": 2.8355831645441388e-05,
"loss": 2.4999,
"step": 554
},
{
"epoch": 0.4851663661694804,
"grad_norm": 0.83203125,
"learning_rate": 2.8284085869828665e-05,
"loss": 2.527,
"step": 555
},
{
"epoch": 0.48604053980221823,
"grad_norm": 0.80859375,
"learning_rate": 2.821231256115666e-05,
"loss": 2.4385,
"step": 556
},
{
"epoch": 0.486914713434956,
"grad_norm": 0.84375,
"learning_rate": 2.8140512321157142e-05,
"loss": 2.5412,
"step": 557
},
{
"epoch": 0.4877888870676938,
"grad_norm": 0.8515625,
"learning_rate": 2.8068685751787636e-05,
"loss": 2.5424,
"step": 558
},
{
"epoch": 0.48866306070043164,
"grad_norm": 0.84375,
"learning_rate": 2.799683345522644e-05,
"loss": 2.5117,
"step": 559
},
{
"epoch": 0.4895372343331694,
"grad_norm": 0.76953125,
"learning_rate": 2.792495603386753e-05,
"loss": 2.4806,
"step": 560
},
{
"epoch": 0.4904114079659072,
"grad_norm": 0.85546875,
"learning_rate": 2.7853054090315505e-05,
"loss": 2.5502,
"step": 561
},
{
"epoch": 0.49128558159864505,
"grad_norm": 1.0078125,
"learning_rate": 2.778112822738059e-05,
"loss": 2.4464,
"step": 562
},
{
"epoch": 0.49215975523138283,
"grad_norm": 0.8046875,
"learning_rate": 2.770917904807352e-05,
"loss": 2.4851,
"step": 563
},
{
"epoch": 0.4930339288641206,
"grad_norm": 0.80859375,
"learning_rate": 2.7637207155600497e-05,
"loss": 2.5079,
"step": 564
},
{
"epoch": 0.49390810249685846,
"grad_norm": 0.8359375,
"learning_rate": 2.756521315335818e-05,
"loss": 2.5144,
"step": 565
},
{
"epoch": 0.49478227612959624,
"grad_norm": 0.83203125,
"learning_rate": 2.7493197644928563e-05,
"loss": 2.5332,
"step": 566
},
{
"epoch": 0.49565644976233403,
"grad_norm": 0.8203125,
"learning_rate": 2.742116123407396e-05,
"loss": 2.5106,
"step": 567
},
{
"epoch": 0.49653062339507187,
"grad_norm": 0.80859375,
"learning_rate": 2.7349104524731916e-05,
"loss": 2.5031,
"step": 568
},
{
"epoch": 0.49740479702780965,
"grad_norm": 0.83203125,
"learning_rate": 2.7277028121010162e-05,
"loss": 2.4668,
"step": 569
},
{
"epoch": 0.49827897066054744,
"grad_norm": 0.85546875,
"learning_rate": 2.720493262718153e-05,
"loss": 2.5557,
"step": 570
},
{
"epoch": 0.4991531442932853,
"grad_norm": 0.7890625,
"learning_rate": 2.7132818647678916e-05,
"loss": 2.4921,
"step": 571
},
{
"epoch": 0.5000273179260231,
"grad_norm": 0.8125,
"learning_rate": 2.7060686787090182e-05,
"loss": 2.496,
"step": 572
},
{
"epoch": 0.5009014915587608,
"grad_norm": 0.80859375,
"learning_rate": 2.6988537650153107e-05,
"loss": 2.511,
"step": 573
},
{
"epoch": 0.5017756651914986,
"grad_norm": 0.828125,
"learning_rate": 2.691637184175031e-05,
"loss": 2.5194,
"step": 574
},
{
"epoch": 0.5026498388242364,
"grad_norm": 0.80078125,
"learning_rate": 2.6844189966904192e-05,
"loss": 2.4826,
"step": 575
},
{
"epoch": 0.5035240124569743,
"grad_norm": 0.828125,
"learning_rate": 2.6771992630771824e-05,
"loss": 2.4936,
"step": 576
},
{
"epoch": 0.5043981860897121,
"grad_norm": 0.80078125,
"learning_rate": 2.6699780438639925e-05,
"loss": 2.5083,
"step": 577
},
{
"epoch": 0.5052723597224499,
"grad_norm": 0.81640625,
"learning_rate": 2.6627553995919764e-05,
"loss": 2.4806,
"step": 578
},
{
"epoch": 0.5061465333551877,
"grad_norm": 0.81640625,
"learning_rate": 2.6555313908142053e-05,
"loss": 2.5227,
"step": 579
},
{
"epoch": 0.5070207069879255,
"grad_norm": 0.82421875,
"learning_rate": 2.648306078095194e-05,
"loss": 2.4796,
"step": 580
},
{
"epoch": 0.5078948806206632,
"grad_norm": 0.81640625,
"learning_rate": 2.6410795220103877e-05,
"loss": 2.4873,
"step": 581
},
{
"epoch": 0.5087690542534011,
"grad_norm": 0.8125,
"learning_rate": 2.6338517831456555e-05,
"loss": 2.5188,
"step": 582
},
{
"epoch": 0.5096432278861389,
"grad_norm": 0.80078125,
"learning_rate": 2.6266229220967818e-05,
"loss": 2.461,
"step": 583
},
{
"epoch": 0.5105174015188767,
"grad_norm": 0.82421875,
"learning_rate": 2.619392999468962e-05,
"loss": 2.5317,
"step": 584
},
{
"epoch": 0.5113915751516145,
"grad_norm": 0.83203125,
"learning_rate": 2.6121620758762877e-05,
"loss": 2.5001,
"step": 585
},
{
"epoch": 0.5122657487843523,
"grad_norm": 0.80078125,
"learning_rate": 2.604930211941245e-05,
"loss": 2.5155,
"step": 586
},
{
"epoch": 0.51313992241709,
"grad_norm": 0.80859375,
"learning_rate": 2.5976974682942046e-05,
"loss": 2.4995,
"step": 587
},
{
"epoch": 0.514014096049828,
"grad_norm": 0.80859375,
"learning_rate": 2.5904639055729092e-05,
"loss": 2.4771,
"step": 588
},
{
"epoch": 0.5148882696825657,
"grad_norm": 0.87109375,
"learning_rate": 2.5832295844219696e-05,
"loss": 2.4807,
"step": 589
},
{
"epoch": 0.5157624433153035,
"grad_norm": 0.82421875,
"learning_rate": 2.5759945654923575e-05,
"loss": 2.4858,
"step": 590
},
{
"epoch": 0.5166366169480413,
"grad_norm": 0.80859375,
"learning_rate": 2.5687589094408908e-05,
"loss": 2.4595,
"step": 591
},
{
"epoch": 0.5175107905807791,
"grad_norm": 3.796875,
"learning_rate": 2.5615226769297325e-05,
"loss": 2.5661,
"step": 592
},
{
"epoch": 0.5183849642135169,
"grad_norm": 0.84375,
"learning_rate": 2.554285928625877e-05,
"loss": 2.5154,
"step": 593
},
{
"epoch": 0.5192591378462547,
"grad_norm": 0.83984375,
"learning_rate": 2.5470487252006414e-05,
"loss": 2.4824,
"step": 594
},
{
"epoch": 0.5201333114789926,
"grad_norm": 0.81640625,
"learning_rate": 2.539811127329161e-05,
"loss": 2.4549,
"step": 595
},
{
"epoch": 0.5210074851117303,
"grad_norm": 0.81640625,
"learning_rate": 2.5325731956898767e-05,
"loss": 2.438,
"step": 596
},
{
"epoch": 0.5218816587444681,
"grad_norm": 0.82421875,
"learning_rate": 2.5253349909640278e-05,
"loss": 2.4597,
"step": 597
},
{
"epoch": 0.5227558323772059,
"grad_norm": 0.8203125,
"learning_rate": 2.518096573835143e-05,
"loss": 2.5094,
"step": 598
},
{
"epoch": 0.5236300060099437,
"grad_norm": 0.82421875,
"learning_rate": 2.510858004988533e-05,
"loss": 2.5704,
"step": 599
},
{
"epoch": 0.5245041796426815,
"grad_norm": 0.82421875,
"learning_rate": 2.5036193451107776e-05,
"loss": 2.4547,
"step": 600
},
{
"epoch": 0.5253783532754194,
"grad_norm": 0.8125,
"learning_rate": 2.4963806548892233e-05,
"loss": 2.5035,
"step": 601
},
{
"epoch": 0.5262525269081572,
"grad_norm": 0.8046875,
"learning_rate": 2.489141995011468e-05,
"loss": 2.4283,
"step": 602
},
{
"epoch": 0.5271267005408949,
"grad_norm": 0.84375,
"learning_rate": 2.4819034261648573e-05,
"loss": 2.472,
"step": 603
},
{
"epoch": 0.5280008741736327,
"grad_norm": 0.8359375,
"learning_rate": 2.474665009035973e-05,
"loss": 2.4643,
"step": 604
},
{
"epoch": 0.5288750478063705,
"grad_norm": 0.84375,
"learning_rate": 2.4674268043101242e-05,
"loss": 2.5151,
"step": 605
},
{
"epoch": 0.5297492214391083,
"grad_norm": 0.8359375,
"learning_rate": 2.4601888726708393e-05,
"loss": 2.5029,
"step": 606
},
{
"epoch": 0.5306233950718462,
"grad_norm": 0.8359375,
"learning_rate": 2.4529512747993595e-05,
"loss": 2.4279,
"step": 607
},
{
"epoch": 0.531497568704584,
"grad_norm": 0.82421875,
"learning_rate": 2.4457140713741237e-05,
"loss": 2.4896,
"step": 608
},
{
"epoch": 0.5323717423373218,
"grad_norm": 0.81640625,
"learning_rate": 2.4384773230702674e-05,
"loss": 2.5096,
"step": 609
},
{
"epoch": 0.5332459159700595,
"grad_norm": 0.81640625,
"learning_rate": 2.43124109055911e-05,
"loss": 2.5153,
"step": 610
},
{
"epoch": 0.5341200896027973,
"grad_norm": 0.81640625,
"learning_rate": 2.4240054345076438e-05,
"loss": 2.4421,
"step": 611
},
{
"epoch": 0.5349942632355351,
"grad_norm": 0.82421875,
"learning_rate": 2.416770415578031e-05,
"loss": 2.5096,
"step": 612
},
{
"epoch": 0.535868436868273,
"grad_norm": 0.79296875,
"learning_rate": 2.4095360944270917e-05,
"loss": 2.5204,
"step": 613
},
{
"epoch": 0.5367426105010108,
"grad_norm": 0.7890625,
"learning_rate": 2.4023025317057963e-05,
"loss": 2.4526,
"step": 614
},
{
"epoch": 0.5376167841337486,
"grad_norm": 0.828125,
"learning_rate": 2.3950697880587547e-05,
"loss": 2.46,
"step": 615
},
{
"epoch": 0.5384909577664864,
"grad_norm": 0.80859375,
"learning_rate": 2.3878379241237136e-05,
"loss": 2.5201,
"step": 616
},
{
"epoch": 0.5393651313992242,
"grad_norm": 0.77734375,
"learning_rate": 2.3806070005310392e-05,
"loss": 2.441,
"step": 617
},
{
"epoch": 0.5402393050319619,
"grad_norm": 0.8203125,
"learning_rate": 2.3733770779032184e-05,
"loss": 2.483,
"step": 618
},
{
"epoch": 0.5411134786646997,
"grad_norm": 0.8125,
"learning_rate": 2.366148216854345e-05,
"loss": 2.4989,
"step": 619
},
{
"epoch": 0.5419876522974376,
"grad_norm": 0.85546875,
"learning_rate": 2.3589204779896125e-05,
"loss": 2.5297,
"step": 620
},
{
"epoch": 0.5428618259301754,
"grad_norm": 0.81640625,
"learning_rate": 2.3516939219048058e-05,
"loss": 2.5281,
"step": 621
},
{
"epoch": 0.5437359995629132,
"grad_norm": 0.80859375,
"learning_rate": 2.344468609185796e-05,
"loss": 2.519,
"step": 622
},
{
"epoch": 0.544610173195651,
"grad_norm": 0.81640625,
"learning_rate": 2.3372446004080252e-05,
"loss": 2.4291,
"step": 623
},
{
"epoch": 0.5454843468283888,
"grad_norm": 0.8046875,
"learning_rate": 2.3300219561360077e-05,
"loss": 2.4963,
"step": 624
},
{
"epoch": 0.5463585204611265,
"grad_norm": 0.796875,
"learning_rate": 2.3228007369228178e-05,
"loss": 2.5143,
"step": 625
},
{
"epoch": 0.5472326940938644,
"grad_norm": 0.80859375,
"learning_rate": 2.3155810033095814e-05,
"loss": 2.4494,
"step": 626
},
{
"epoch": 0.5481068677266022,
"grad_norm": 0.80859375,
"learning_rate": 2.308362815824969e-05,
"loss": 2.4042,
"step": 627
},
{
"epoch": 0.54898104135934,
"grad_norm": 0.84375,
"learning_rate": 2.3011462349846905e-05,
"loss": 2.5261,
"step": 628
},
{
"epoch": 0.5498552149920778,
"grad_norm": 0.8203125,
"learning_rate": 2.293931321290983e-05,
"loss": 2.4527,
"step": 629
},
{
"epoch": 0.5507293886248156,
"grad_norm": 0.7890625,
"learning_rate": 2.2867181352321093e-05,
"loss": 2.5016,
"step": 630
},
{
"epoch": 0.5516035622575534,
"grad_norm": 0.859375,
"learning_rate": 2.2795067372818473e-05,
"loss": 2.4918,
"step": 631
},
{
"epoch": 0.5524777358902913,
"grad_norm": 0.83984375,
"learning_rate": 2.272297187898984e-05,
"loss": 2.4603,
"step": 632
},
{
"epoch": 0.553351909523029,
"grad_norm": 0.83984375,
"learning_rate": 2.2650895475268086e-05,
"loss": 2.4828,
"step": 633
},
{
"epoch": 0.5542260831557668,
"grad_norm": 0.828125,
"learning_rate": 2.257883876592604e-05,
"loss": 2.5157,
"step": 634
},
{
"epoch": 0.5551002567885046,
"grad_norm": 0.8359375,
"learning_rate": 2.2506802355071443e-05,
"loss": 2.5095,
"step": 635
},
{
"epoch": 0.5559744304212424,
"grad_norm": 0.78515625,
"learning_rate": 2.2434786846641824e-05,
"loss": 2.5183,
"step": 636
},
{
"epoch": 0.5568486040539802,
"grad_norm": 0.796875,
"learning_rate": 2.2362792844399505e-05,
"loss": 2.4203,
"step": 637
},
{
"epoch": 0.5577227776867181,
"grad_norm": 0.8203125,
"learning_rate": 2.2290820951926487e-05,
"loss": 2.4516,
"step": 638
},
{
"epoch": 0.5585969513194559,
"grad_norm": 0.83984375,
"learning_rate": 2.221887177261941e-05,
"loss": 2.5421,
"step": 639
},
{
"epoch": 0.5594711249521936,
"grad_norm": 0.81640625,
"learning_rate": 2.214694590968449e-05,
"loss": 2.4276,
"step": 640
},
{
"epoch": 0.5603452985849314,
"grad_norm": 0.82421875,
"learning_rate": 2.2075043966132484e-05,
"loss": 2.4471,
"step": 641
},
{
"epoch": 0.5612194722176692,
"grad_norm": 0.80078125,
"learning_rate": 2.2003166544773567e-05,
"loss": 2.5014,
"step": 642
},
{
"epoch": 0.562093645850407,
"grad_norm": 0.81640625,
"learning_rate": 2.1931314248212366e-05,
"loss": 2.4937,
"step": 643
},
{
"epoch": 0.5629678194831449,
"grad_norm": 0.79296875,
"learning_rate": 2.1859487678842864e-05,
"loss": 2.5088,
"step": 644
},
{
"epoch": 0.5638419931158827,
"grad_norm": 0.84375,
"learning_rate": 2.1787687438843344e-05,
"loss": 2.5142,
"step": 645
},
{
"epoch": 0.5647161667486205,
"grad_norm": 0.796875,
"learning_rate": 2.1715914130171337e-05,
"loss": 2.4418,
"step": 646
},
{
"epoch": 0.5655903403813582,
"grad_norm": 0.79296875,
"learning_rate": 2.164416835455862e-05,
"loss": 2.4465,
"step": 647
},
{
"epoch": 0.566464514014096,
"grad_norm": 0.828125,
"learning_rate": 2.1572450713506098e-05,
"loss": 2.4755,
"step": 648
},
{
"epoch": 0.5673386876468338,
"grad_norm": 0.80078125,
"learning_rate": 2.1500761808278834e-05,
"loss": 2.4652,
"step": 649
},
{
"epoch": 0.5682128612795716,
"grad_norm": 0.79296875,
"learning_rate": 2.1429102239900967e-05,
"loss": 2.4644,
"step": 650
},
{
"epoch": 0.5690870349123095,
"grad_norm": 0.8203125,
"learning_rate": 2.1357472609150676e-05,
"loss": 2.4921,
"step": 651
},
{
"epoch": 0.5699612085450473,
"grad_norm": 0.80078125,
"learning_rate": 2.128587351655516e-05,
"loss": 2.4701,
"step": 652
},
{
"epoch": 0.5708353821777851,
"grad_norm": 0.8203125,
"learning_rate": 2.1214305562385592e-05,
"loss": 2.4611,
"step": 653
},
{
"epoch": 0.5717095558105229,
"grad_norm": 0.8046875,
"learning_rate": 2.1142769346652064e-05,
"loss": 2.4365,
"step": 654
},
{
"epoch": 0.5725837294432606,
"grad_norm": 0.79296875,
"learning_rate": 2.1071265469098607e-05,
"loss": 2.4008,
"step": 655
},
{
"epoch": 0.5734579030759984,
"grad_norm": 0.80859375,
"learning_rate": 2.0999794529198124e-05,
"loss": 2.394,
"step": 656
},
{
"epoch": 0.5743320767087363,
"grad_norm": 0.8046875,
"learning_rate": 2.0928357126147387e-05,
"loss": 2.5286,
"step": 657
},
{
"epoch": 0.5752062503414741,
"grad_norm": 1.078125,
"learning_rate": 2.0856953858861995e-05,
"loss": 2.4543,
"step": 658
},
{
"epoch": 0.5760804239742119,
"grad_norm": 0.81640625,
"learning_rate": 2.078558532597135e-05,
"loss": 2.4641,
"step": 659
},
{
"epoch": 0.5769545976069497,
"grad_norm": 0.84375,
"learning_rate": 2.0714252125813667e-05,
"loss": 2.5054,
"step": 660
},
{
"epoch": 0.5778287712396875,
"grad_norm": 0.79296875,
"learning_rate": 2.0642954856430913e-05,
"loss": 2.5043,
"step": 661
},
{
"epoch": 0.5787029448724252,
"grad_norm": 0.80078125,
"learning_rate": 2.057169411556385e-05,
"loss": 2.5082,
"step": 662
},
{
"epoch": 0.5795771185051631,
"grad_norm": 0.78515625,
"learning_rate": 2.0500470500646978e-05,
"loss": 2.4855,
"step": 663
},
{
"epoch": 0.5804512921379009,
"grad_norm": 0.8125,
"learning_rate": 2.0429284608803546e-05,
"loss": 2.5129,
"step": 664
},
{
"epoch": 0.5813254657706387,
"grad_norm": 0.8125,
"learning_rate": 2.0358137036840528e-05,
"loss": 2.4905,
"step": 665
},
{
"epoch": 0.5821996394033765,
"grad_norm": 0.78515625,
"learning_rate": 2.0287028381243645e-05,
"loss": 2.4457,
"step": 666
},
{
"epoch": 0.5830738130361143,
"grad_norm": 0.8046875,
"learning_rate": 2.0215959238172345e-05,
"loss": 2.4677,
"step": 667
},
{
"epoch": 0.5839479866688521,
"grad_norm": 0.78515625,
"learning_rate": 2.0144930203454816e-05,
"loss": 2.4793,
"step": 668
},
{
"epoch": 0.58482216030159,
"grad_norm": 0.80078125,
"learning_rate": 2.0073941872582984e-05,
"loss": 2.4967,
"step": 669
},
{
"epoch": 0.5856963339343277,
"grad_norm": 0.78125,
"learning_rate": 2.0002994840707534e-05,
"loss": 2.4472,
"step": 670
},
{
"epoch": 0.5865705075670655,
"grad_norm": 0.81640625,
"learning_rate": 1.9932089702632897e-05,
"loss": 2.5045,
"step": 671
},
{
"epoch": 0.5874446811998033,
"grad_norm": 0.7734375,
"learning_rate": 1.986122705281227e-05,
"loss": 2.4368,
"step": 672
},
{
"epoch": 0.5883188548325411,
"grad_norm": 0.796875,
"learning_rate": 1.979040748534264e-05,
"loss": 2.4439,
"step": 673
},
{
"epoch": 0.5891930284652789,
"grad_norm": 0.796875,
"learning_rate": 1.9719631593959816e-05,
"loss": 2.4486,
"step": 674
},
{
"epoch": 0.5900672020980167,
"grad_norm": 0.80859375,
"learning_rate": 1.9648899972033426e-05,
"loss": 2.4085,
"step": 675
},
{
"epoch": 0.5909413757307546,
"grad_norm": 0.796875,
"learning_rate": 1.9578213212561953e-05,
"loss": 2.3664,
"step": 676
},
{
"epoch": 0.5918155493634923,
"grad_norm": 0.828125,
"learning_rate": 1.950757190816777e-05,
"loss": 2.5016,
"step": 677
},
{
"epoch": 0.5926897229962301,
"grad_norm": 0.79296875,
"learning_rate": 1.9436976651092144e-05,
"loss": 2.48,
"step": 678
},
{
"epoch": 0.5935638966289679,
"grad_norm": 0.7734375,
"learning_rate": 1.9366428033190313e-05,
"loss": 2.4471,
"step": 679
},
{
"epoch": 0.5944380702617057,
"grad_norm": 0.7890625,
"learning_rate": 1.929592664592649e-05,
"loss": 2.4438,
"step": 680
},
{
"epoch": 0.5953122438944435,
"grad_norm": 0.80078125,
"learning_rate": 1.9225473080368916e-05,
"loss": 2.4818,
"step": 681
},
{
"epoch": 0.5961864175271814,
"grad_norm": 0.79296875,
"learning_rate": 1.9155067927184926e-05,
"loss": 2.5117,
"step": 682
},
{
"epoch": 0.5970605911599192,
"grad_norm": 0.8203125,
"learning_rate": 1.9084711776635958e-05,
"loss": 2.4846,
"step": 683
},
{
"epoch": 0.597934764792657,
"grad_norm": 0.80078125,
"learning_rate": 1.901440521857261e-05,
"loss": 2.4458,
"step": 684
},
{
"epoch": 0.5988089384253947,
"grad_norm": 0.80078125,
"learning_rate": 1.894414884242974e-05,
"loss": 2.517,
"step": 685
},
{
"epoch": 0.5996831120581325,
"grad_norm": 0.796875,
"learning_rate": 1.8873943237221453e-05,
"loss": 2.4851,
"step": 686
},
{
"epoch": 0.6005572856908703,
"grad_norm": 0.7890625,
"learning_rate": 1.880378899153624e-05,
"loss": 2.4821,
"step": 687
},
{
"epoch": 0.6014314593236082,
"grad_norm": 0.79296875,
"learning_rate": 1.8733686693531985e-05,
"loss": 2.4555,
"step": 688
},
{
"epoch": 0.602305632956346,
"grad_norm": 0.78515625,
"learning_rate": 1.8663636930931063e-05,
"loss": 2.5098,
"step": 689
},
{
"epoch": 0.6031798065890838,
"grad_norm": 0.77734375,
"learning_rate": 1.859364029101538e-05,
"loss": 2.4382,
"step": 690
},
{
"epoch": 0.6040539802218216,
"grad_norm": 0.80859375,
"learning_rate": 1.8523697360621504e-05,
"loss": 2.5444,
"step": 691
},
{
"epoch": 0.6049281538545593,
"grad_norm": 0.80078125,
"learning_rate": 1.8453808726135695e-05,
"loss": 2.5187,
"step": 692
},
{
"epoch": 0.6058023274872971,
"grad_norm": 0.78125,
"learning_rate": 1.838397497348901e-05,
"loss": 2.5029,
"step": 693
},
{
"epoch": 0.606676501120035,
"grad_norm": 0.83203125,
"learning_rate": 1.8314196688152403e-05,
"loss": 2.5432,
"step": 694
},
{
"epoch": 0.6075506747527728,
"grad_norm": 0.78125,
"learning_rate": 1.8244474455131792e-05,
"loss": 2.4449,
"step": 695
},
{
"epoch": 0.6084248483855106,
"grad_norm": 0.80078125,
"learning_rate": 1.8174808858963145e-05,
"loss": 2.4266,
"step": 696
},
{
"epoch": 0.6092990220182484,
"grad_norm": 0.81640625,
"learning_rate": 1.810520048370763e-05,
"loss": 2.5512,
"step": 697
},
{
"epoch": 0.6101731956509862,
"grad_norm": 0.8125,
"learning_rate": 1.8035649912946684e-05,
"loss": 2.5539,
"step": 698
},
{
"epoch": 0.6110473692837239,
"grad_norm": 0.80078125,
"learning_rate": 1.7966157729777095e-05,
"loss": 2.4313,
"step": 699
},
{
"epoch": 0.6119215429164618,
"grad_norm": 0.8125,
"learning_rate": 1.7896724516806175e-05,
"loss": 2.4875,
"step": 700
},
{
"epoch": 0.6127957165491996,
"grad_norm": 0.79296875,
"learning_rate": 1.782735085614683e-05,
"loss": 2.4663,
"step": 701
},
{
"epoch": 0.6136698901819374,
"grad_norm": 0.80078125,
"learning_rate": 1.77580373294127e-05,
"loss": 2.4446,
"step": 702
},
{
"epoch": 0.6145440638146752,
"grad_norm": 0.80859375,
"learning_rate": 1.7688784517713248e-05,
"loss": 2.5128,
"step": 703
},
{
"epoch": 0.615418237447413,
"grad_norm": 0.8046875,
"learning_rate": 1.7619593001648947e-05,
"loss": 2.3915,
"step": 704
},
{
"epoch": 0.6162924110801508,
"grad_norm": 0.77734375,
"learning_rate": 1.755046336130636e-05,
"loss": 2.4668,
"step": 705
},
{
"epoch": 0.6171665847128885,
"grad_norm": 0.796875,
"learning_rate": 1.7481396176253313e-05,
"loss": 2.4907,
"step": 706
},
{
"epoch": 0.6180407583456264,
"grad_norm": 0.828125,
"learning_rate": 1.7412392025534012e-05,
"loss": 2.447,
"step": 707
},
{
"epoch": 0.6189149319783642,
"grad_norm": 0.8046875,
"learning_rate": 1.7343451487664214e-05,
"loss": 2.4827,
"step": 708
},
{
"epoch": 0.619789105611102,
"grad_norm": 0.78515625,
"learning_rate": 1.7274575140626318e-05,
"loss": 2.5083,
"step": 709
},
{
"epoch": 0.6206632792438398,
"grad_norm": 0.79296875,
"learning_rate": 1.72057635618646e-05,
"loss": 2.4876,
"step": 710
},
{
"epoch": 0.6215374528765776,
"grad_norm": 0.78515625,
"learning_rate": 1.713701732828032e-05,
"loss": 2.4839,
"step": 711
},
{
"epoch": 0.6224116265093154,
"grad_norm": 0.77734375,
"learning_rate": 1.7068337016226893e-05,
"loss": 2.5058,
"step": 712
},
{
"epoch": 0.6232858001420533,
"grad_norm": 0.79296875,
"learning_rate": 1.6999723201505078e-05,
"loss": 2.5269,
"step": 713
},
{
"epoch": 0.624159973774791,
"grad_norm": 0.7890625,
"learning_rate": 1.6931176459358126e-05,
"loss": 2.4452,
"step": 714
},
{
"epoch": 0.6250341474075288,
"grad_norm": 0.765625,
"learning_rate": 1.686269736446695e-05,
"loss": 2.4149,
"step": 715
},
{
"epoch": 0.6259083210402666,
"grad_norm": 0.78515625,
"learning_rate": 1.6794286490945342e-05,
"loss": 2.4508,
"step": 716
},
{
"epoch": 0.6267824946730044,
"grad_norm": 0.796875,
"learning_rate": 1.672594441233515e-05,
"loss": 2.4886,
"step": 717
},
{
"epoch": 0.6276566683057422,
"grad_norm": 0.81640625,
"learning_rate": 1.6657671701601434e-05,
"loss": 2.4674,
"step": 718
},
{
"epoch": 0.6285308419384801,
"grad_norm": 0.77734375,
"learning_rate": 1.6589468931127707e-05,
"loss": 2.4671,
"step": 719
},
{
"epoch": 0.6294050155712179,
"grad_norm": 0.765625,
"learning_rate": 1.6521336672711123e-05,
"loss": 2.4426,
"step": 720
},
{
"epoch": 0.6302791892039556,
"grad_norm": 0.7890625,
"learning_rate": 1.645327549755765e-05,
"loss": 2.4402,
"step": 721
},
{
"epoch": 0.6311533628366934,
"grad_norm": 0.828125,
"learning_rate": 1.6385285976277337e-05,
"loss": 2.4235,
"step": 722
},
{
"epoch": 0.6320275364694312,
"grad_norm": 0.80859375,
"learning_rate": 1.6317368678879495e-05,
"loss": 2.4949,
"step": 723
},
{
"epoch": 0.632901710102169,
"grad_norm": 0.78125,
"learning_rate": 1.624952417476792e-05,
"loss": 2.4693,
"step": 724
},
{
"epoch": 0.6337758837349069,
"grad_norm": 0.78515625,
"learning_rate": 1.618175303273614e-05,
"loss": 2.4108,
"step": 725
},
{
"epoch": 0.6346500573676447,
"grad_norm": 0.796875,
"learning_rate": 1.6114055820962617e-05,
"loss": 2.5233,
"step": 726
},
{
"epoch": 0.6355242310003825,
"grad_norm": 0.78515625,
"learning_rate": 1.6046433107005994e-05,
"loss": 2.4736,
"step": 727
},
{
"epoch": 0.6363984046331203,
"grad_norm": 0.796875,
"learning_rate": 1.5978885457800345e-05,
"loss": 2.4318,
"step": 728
},
{
"epoch": 0.637272578265858,
"grad_norm": 0.78125,
"learning_rate": 1.5911413439650436e-05,
"loss": 2.4888,
"step": 729
},
{
"epoch": 0.6381467518985958,
"grad_norm": 0.8046875,
"learning_rate": 1.5844017618226935e-05,
"loss": 2.5133,
"step": 730
},
{
"epoch": 0.6390209255313336,
"grad_norm": 0.796875,
"learning_rate": 1.5776698558561696e-05,
"loss": 2.4708,
"step": 731
},
{
"epoch": 0.6398950991640715,
"grad_norm": 0.76953125,
"learning_rate": 1.5709456825043046e-05,
"loss": 2.4479,
"step": 732
},
{
"epoch": 0.6407692727968093,
"grad_norm": 0.77734375,
"learning_rate": 1.5642292981410976e-05,
"loss": 2.4754,
"step": 733
},
{
"epoch": 0.6416434464295471,
"grad_norm": 0.7890625,
"learning_rate": 1.557520759075251e-05,
"loss": 2.4537,
"step": 734
},
{
"epoch": 0.6425176200622849,
"grad_norm": 0.7734375,
"learning_rate": 1.5508201215496926e-05,
"loss": 2.4646,
"step": 735
},
{
"epoch": 0.6433917936950226,
"grad_norm": 0.8046875,
"learning_rate": 1.5441274417411053e-05,
"loss": 2.5035,
"step": 736
},
{
"epoch": 0.6442659673277604,
"grad_norm": 0.76171875,
"learning_rate": 1.5374427757594552e-05,
"loss": 2.4973,
"step": 737
},
{
"epoch": 0.6451401409604983,
"grad_norm": 0.75390625,
"learning_rate": 1.5307661796475247e-05,
"loss": 2.351,
"step": 738
},
{
"epoch": 0.6460143145932361,
"grad_norm": 0.75390625,
"learning_rate": 1.5240977093804365e-05,
"loss": 2.457,
"step": 739
},
{
"epoch": 0.6468884882259739,
"grad_norm": 0.7890625,
"learning_rate": 1.5174374208651912e-05,
"loss": 2.4152,
"step": 740
},
{
"epoch": 0.6477626618587117,
"grad_norm": 0.80859375,
"learning_rate": 1.5107853699401945e-05,
"loss": 2.4671,
"step": 741
},
{
"epoch": 0.6486368354914495,
"grad_norm": 0.78125,
"learning_rate": 1.5041416123747899e-05,
"loss": 2.4371,
"step": 742
},
{
"epoch": 0.6495110091241872,
"grad_norm": 0.78515625,
"learning_rate": 1.4975062038687904e-05,
"loss": 2.4177,
"step": 743
},
{
"epoch": 0.6503851827569251,
"grad_norm": 0.79296875,
"learning_rate": 1.4908792000520141e-05,
"loss": 2.4789,
"step": 744
},
{
"epoch": 0.6512593563896629,
"grad_norm": 0.7890625,
"learning_rate": 1.484260656483813e-05,
"loss": 2.456,
"step": 745
},
{
"epoch": 0.6521335300224007,
"grad_norm": 0.8046875,
"learning_rate": 1.4776506286526131e-05,
"loss": 2.4577,
"step": 746
},
{
"epoch": 0.6530077036551385,
"grad_norm": 0.78125,
"learning_rate": 1.4710491719754454e-05,
"loss": 2.4199,
"step": 747
},
{
"epoch": 0.6538818772878763,
"grad_norm": 0.796875,
"learning_rate": 1.4644563417974827e-05,
"loss": 2.4352,
"step": 748
},
{
"epoch": 0.6547560509206141,
"grad_norm": 3.328125,
"learning_rate": 1.4578721933915723e-05,
"loss": 2.4898,
"step": 749
},
{
"epoch": 0.655630224553352,
"grad_norm": 0.79296875,
"learning_rate": 1.4512967819577815e-05,
"loss": 2.4195,
"step": 750
},
{
"epoch": 0.6565043981860897,
"grad_norm": 0.82421875,
"learning_rate": 1.4447301626229204e-05,
"loss": 2.5405,
"step": 751
},
{
"epoch": 0.6573785718188275,
"grad_norm": 0.78125,
"learning_rate": 1.4381723904400957e-05,
"loss": 2.4603,
"step": 752
},
{
"epoch": 0.6582527454515653,
"grad_norm": 0.80078125,
"learning_rate": 1.4316235203882371e-05,
"loss": 2.4595,
"step": 753
},
{
"epoch": 0.6591269190843031,
"grad_norm": 0.79296875,
"learning_rate": 1.4250836073716411e-05,
"loss": 2.4936,
"step": 754
},
{
"epoch": 0.6600010927170409,
"grad_norm": 0.78515625,
"learning_rate": 1.418552706219514e-05,
"loss": 2.4954,
"step": 755
},
{
"epoch": 0.6608752663497788,
"grad_norm": 0.80078125,
"learning_rate": 1.4120308716855038e-05,
"loss": 2.4924,
"step": 756
},
{
"epoch": 0.6617494399825166,
"grad_norm": 0.81640625,
"learning_rate": 1.4055181584472488e-05,
"loss": 2.4962,
"step": 757
},
{
"epoch": 0.6626236136152543,
"grad_norm": 0.8046875,
"learning_rate": 1.399014621105914e-05,
"loss": 2.4441,
"step": 758
},
{
"epoch": 0.6634977872479921,
"grad_norm": 0.82421875,
"learning_rate": 1.3925203141857398e-05,
"loss": 2.5086,
"step": 759
},
{
"epoch": 0.6643719608807299,
"grad_norm": 0.78515625,
"learning_rate": 1.386035292133577e-05,
"loss": 2.4698,
"step": 760
},
{
"epoch": 0.6652461345134677,
"grad_norm": 0.77734375,
"learning_rate": 1.3795596093184344e-05,
"loss": 2.4269,
"step": 761
},
{
"epoch": 0.6661203081462055,
"grad_norm": 0.7890625,
"learning_rate": 1.3730933200310252e-05,
"loss": 2.4506,
"step": 762
},
{
"epoch": 0.6669944817789434,
"grad_norm": 0.80859375,
"learning_rate": 1.3666364784833075e-05,
"loss": 2.4774,
"step": 763
},
{
"epoch": 0.6678686554116812,
"grad_norm": 0.78125,
"learning_rate": 1.3601891388080313e-05,
"loss": 2.4573,
"step": 764
},
{
"epoch": 0.668742829044419,
"grad_norm": 0.77734375,
"learning_rate": 1.3537513550582853e-05,
"loss": 2.4305,
"step": 765
},
{
"epoch": 0.6696170026771567,
"grad_norm": 0.796875,
"learning_rate": 1.3473231812070427e-05,
"loss": 2.4808,
"step": 766
},
{
"epoch": 0.6704911763098945,
"grad_norm": 0.7890625,
"learning_rate": 1.3409046711467127e-05,
"loss": 2.4969,
"step": 767
},
{
"epoch": 0.6713653499426323,
"grad_norm": 0.7734375,
"learning_rate": 1.3344958786886808e-05,
"loss": 2.4678,
"step": 768
},
{
"epoch": 0.6722395235753702,
"grad_norm": 0.765625,
"learning_rate": 1.3280968575628674e-05,
"loss": 2.4409,
"step": 769
},
{
"epoch": 0.673113697208108,
"grad_norm": 0.78515625,
"learning_rate": 1.3217076614172652e-05,
"loss": 2.5037,
"step": 770
},
{
"epoch": 0.6739878708408458,
"grad_norm": 0.7734375,
"learning_rate": 1.3153283438175034e-05,
"loss": 2.4852,
"step": 771
},
{
"epoch": 0.6748620444735836,
"grad_norm": 0.7890625,
"learning_rate": 1.3089589582463879e-05,
"loss": 2.4512,
"step": 772
},
{
"epoch": 0.6757362181063213,
"grad_norm": 0.78515625,
"learning_rate": 1.3025995581034561e-05,
"loss": 2.4298,
"step": 773
},
{
"epoch": 0.6766103917390591,
"grad_norm": 0.77734375,
"learning_rate": 1.2962501967045332e-05,
"loss": 2.4524,
"step": 774
},
{
"epoch": 0.677484565371797,
"grad_norm": 0.765625,
"learning_rate": 1.2899109272812788e-05,
"loss": 2.4817,
"step": 775
},
{
"epoch": 0.6783587390045348,
"grad_norm": 0.77734375,
"learning_rate": 1.283581802980744e-05,
"loss": 2.4183,
"step": 776
},
{
"epoch": 0.6792329126372726,
"grad_norm": 0.78515625,
"learning_rate": 1.2772628768649247e-05,
"loss": 2.4454,
"step": 777
},
{
"epoch": 0.6801070862700104,
"grad_norm": 0.8046875,
"learning_rate": 1.270954201910321e-05,
"loss": 2.4866,
"step": 778
},
{
"epoch": 0.6809812599027482,
"grad_norm": 0.77734375,
"learning_rate": 1.264655831007486e-05,
"loss": 2.4648,
"step": 779
},
{
"epoch": 0.6818554335354859,
"grad_norm": 0.7734375,
"learning_rate": 1.2583678169605857e-05,
"loss": 2.416,
"step": 780
},
{
"epoch": 0.6827296071682238,
"grad_norm": 0.796875,
"learning_rate": 1.2520902124869605e-05,
"loss": 2.4401,
"step": 781
},
{
"epoch": 0.6836037808009616,
"grad_norm": 0.77734375,
"learning_rate": 1.245823070216673e-05,
"loss": 2.4564,
"step": 782
},
{
"epoch": 0.6844779544336994,
"grad_norm": 0.79296875,
"learning_rate": 1.239566442692079e-05,
"loss": 2.5142,
"step": 783
},
{
"epoch": 0.6853521280664372,
"grad_norm": 0.796875,
"learning_rate": 1.2333203823673773e-05,
"loss": 2.44,
"step": 784
},
{
"epoch": 0.686226301699175,
"grad_norm": 0.78125,
"learning_rate": 1.2270849416081737e-05,
"loss": 2.481,
"step": 785
},
{
"epoch": 0.6871004753319128,
"grad_norm": 0.77734375,
"learning_rate": 1.2208601726910446e-05,
"loss": 2.4822,
"step": 786
},
{
"epoch": 0.6879746489646507,
"grad_norm": 0.78125,
"learning_rate": 1.2146461278030938e-05,
"loss": 2.4373,
"step": 787
},
{
"epoch": 0.6888488225973884,
"grad_norm": 0.796875,
"learning_rate": 1.2084428590415172e-05,
"loss": 2.4376,
"step": 788
},
{
"epoch": 0.6897229962301262,
"grad_norm": 0.78515625,
"learning_rate": 1.2022504184131656e-05,
"loss": 2.4519,
"step": 789
},
{
"epoch": 0.690597169862864,
"grad_norm": 0.7734375,
"learning_rate": 1.1960688578341117e-05,
"loss": 2.3984,
"step": 790
},
{
"epoch": 0.6914713434956018,
"grad_norm": 0.76953125,
"learning_rate": 1.1898982291292096e-05,
"loss": 2.4713,
"step": 791
},
{
"epoch": 0.6923455171283396,
"grad_norm": 0.78515625,
"learning_rate": 1.1837385840316628e-05,
"loss": 2.453,
"step": 792
},
{
"epoch": 0.6932196907610774,
"grad_norm": 0.91796875,
"learning_rate": 1.1775899741825947e-05,
"loss": 2.4441,
"step": 793
},
{
"epoch": 0.6940938643938153,
"grad_norm": 0.7734375,
"learning_rate": 1.1714524511306043e-05,
"loss": 2.4505,
"step": 794
},
{
"epoch": 0.694968038026553,
"grad_norm": 0.8359375,
"learning_rate": 1.165326066331349e-05,
"loss": 2.4896,
"step": 795
},
{
"epoch": 0.6958422116592908,
"grad_norm": 0.79296875,
"learning_rate": 1.1592108711470995e-05,
"loss": 2.4831,
"step": 796
},
{
"epoch": 0.6967163852920286,
"grad_norm": 0.796875,
"learning_rate": 1.1531069168463202e-05,
"loss": 2.4131,
"step": 797
},
{
"epoch": 0.6975905589247664,
"grad_norm": 0.796875,
"learning_rate": 1.1470142546032304e-05,
"loss": 2.4331,
"step": 798
},
{
"epoch": 0.6984647325575042,
"grad_norm": 0.8125,
"learning_rate": 1.1409329354973814e-05,
"loss": 2.4559,
"step": 799
},
{
"epoch": 0.6993389061902421,
"grad_norm": 0.77734375,
"learning_rate": 1.1348630105132253e-05,
"loss": 2.5032,
"step": 800
},
{
"epoch": 0.7002130798229799,
"grad_norm": 0.77734375,
"learning_rate": 1.1288045305396874e-05,
"loss": 2.4401,
"step": 801
},
{
"epoch": 0.7010872534557177,
"grad_norm": 0.76953125,
"learning_rate": 1.122757546369744e-05,
"loss": 2.4558,
"step": 802
},
{
"epoch": 0.7019614270884554,
"grad_norm": 1.28125,
"learning_rate": 1.1167221086999895e-05,
"loss": 2.5234,
"step": 803
},
{
"epoch": 0.7028356007211932,
"grad_norm": 0.76953125,
"learning_rate": 1.1106982681302159e-05,
"loss": 2.4717,
"step": 804
},
{
"epoch": 0.703709774353931,
"grad_norm": 0.765625,
"learning_rate": 1.10468607516299e-05,
"loss": 2.4759,
"step": 805
},
{
"epoch": 0.7045839479866689,
"grad_norm": 0.7890625,
"learning_rate": 1.0986855802032225e-05,
"loss": 2.5144,
"step": 806
},
{
"epoch": 0.7054581216194067,
"grad_norm": 0.78125,
"learning_rate": 1.0926968335577564e-05,
"loss": 2.4884,
"step": 807
},
{
"epoch": 0.7063322952521445,
"grad_norm": 0.81640625,
"learning_rate": 1.086719885434935e-05,
"loss": 2.4915,
"step": 808
},
{
"epoch": 0.7072064688848823,
"grad_norm": 0.76171875,
"learning_rate": 1.0807547859441885e-05,
"loss": 2.4426,
"step": 809
},
{
"epoch": 0.70808064251762,
"grad_norm": 0.7578125,
"learning_rate": 1.0748015850956086e-05,
"loss": 2.4774,
"step": 810
},
{
"epoch": 0.7089548161503578,
"grad_norm": 0.8046875,
"learning_rate": 1.0688603327995323e-05,
"loss": 2.4552,
"step": 811
},
{
"epoch": 0.7098289897830957,
"grad_norm": 0.78125,
"learning_rate": 1.0629310788661222e-05,
"loss": 2.444,
"step": 812
},
{
"epoch": 0.7107031634158335,
"grad_norm": 0.80078125,
"learning_rate": 1.0570138730049484e-05,
"loss": 2.4437,
"step": 813
},
{
"epoch": 0.7115773370485713,
"grad_norm": 0.7890625,
"learning_rate": 1.0511087648245757e-05,
"loss": 2.4354,
"step": 814
},
{
"epoch": 0.7124515106813091,
"grad_norm": 0.80078125,
"learning_rate": 1.0452158038321402e-05,
"loss": 2.499,
"step": 815
},
{
"epoch": 0.7133256843140469,
"grad_norm": 0.76171875,
"learning_rate": 1.0393350394329429e-05,
"loss": 2.4979,
"step": 816
},
{
"epoch": 0.7141998579467846,
"grad_norm": 0.78515625,
"learning_rate": 1.0334665209300295e-05,
"loss": 2.5171,
"step": 817
},
{
"epoch": 0.7150740315795224,
"grad_norm": 0.7890625,
"learning_rate": 1.0276102975237754e-05,
"loss": 2.4536,
"step": 818
},
{
"epoch": 0.7159482052122603,
"grad_norm": 0.796875,
"learning_rate": 1.0217664183114825e-05,
"loss": 2.4536,
"step": 819
},
{
"epoch": 0.7168223788449981,
"grad_norm": 0.77734375,
"learning_rate": 1.0159349322869574e-05,
"loss": 2.4038,
"step": 820
},
{
"epoch": 0.7176965524777359,
"grad_norm": 0.78125,
"learning_rate": 1.0101158883401077e-05,
"loss": 2.4728,
"step": 821
},
{
"epoch": 0.7185707261104737,
"grad_norm": 0.78125,
"learning_rate": 1.0043093352565272e-05,
"loss": 2.4679,
"step": 822
},
{
"epoch": 0.7194448997432115,
"grad_norm": 0.78125,
"learning_rate": 9.985153217170903e-06,
"loss": 2.5158,
"step": 823
},
{
"epoch": 0.7203190733759492,
"grad_norm": 0.7734375,
"learning_rate": 9.927338962975416e-06,
"loss": 2.4396,
"step": 824
},
{
"epoch": 0.7211932470086871,
"grad_norm": 0.77734375,
"learning_rate": 9.869651074680893e-06,
"loss": 2.4815,
"step": 825
},
{
"epoch": 0.7220674206414249,
"grad_norm": 0.77734375,
"learning_rate": 9.812090035930024e-06,
"loss": 2.3869,
"step": 826
},
{
"epoch": 0.7229415942741627,
"grad_norm": 0.79296875,
"learning_rate": 9.754656329301976e-06,
"loss": 2.4363,
"step": 827
},
{
"epoch": 0.7238157679069005,
"grad_norm": 0.78515625,
"learning_rate": 9.697350436308427e-06,
"loss": 2.4645,
"step": 828
},
{
"epoch": 0.7246899415396383,
"grad_norm": 0.765625,
"learning_rate": 9.640172837389475e-06,
"loss": 2.4348,
"step": 829
},
{
"epoch": 0.7255641151723761,
"grad_norm": 0.765625,
"learning_rate": 9.583124011909628e-06,
"loss": 2.5198,
"step": 830
},
{
"epoch": 0.726438288805114,
"grad_norm": 0.796875,
"learning_rate": 9.526204438153794e-06,
"loss": 2.4499,
"step": 831
},
{
"epoch": 0.7273124624378517,
"grad_norm": 0.78515625,
"learning_rate": 9.469414593323242e-06,
"loss": 2.4385,
"step": 832
},
{
"epoch": 0.7281866360705895,
"grad_norm": 0.80078125,
"learning_rate": 9.412754953531663e-06,
"loss": 2.4505,
"step": 833
},
{
"epoch": 0.7290608097033273,
"grad_norm": 0.7734375,
"learning_rate": 9.356225993801101e-06,
"loss": 2.4464,
"step": 834
},
{
"epoch": 0.7299349833360651,
"grad_norm": 0.76953125,
"learning_rate": 9.299828188058013e-06,
"loss": 2.3962,
"step": 835
},
{
"epoch": 0.7308091569688029,
"grad_norm": 0.81640625,
"learning_rate": 9.243562009129316e-06,
"loss": 2.4827,
"step": 836
},
{
"epoch": 0.7316833306015408,
"grad_norm": 0.79296875,
"learning_rate": 9.187427928738343e-06,
"loss": 2.5232,
"step": 837
},
{
"epoch": 0.7325575042342786,
"grad_norm": 0.765625,
"learning_rate": 9.131426417501005e-06,
"loss": 2.4248,
"step": 838
},
{
"epoch": 0.7334316778670164,
"grad_norm": 0.765625,
"learning_rate": 9.075557944921728e-06,
"loss": 2.5111,
"step": 839
},
{
"epoch": 0.7343058514997541,
"grad_norm": 0.76171875,
"learning_rate": 9.019822979389614e-06,
"loss": 2.3708,
"step": 840
},
{
"epoch": 0.7351800251324919,
"grad_norm": 0.80078125,
"learning_rate": 8.964221988174442e-06,
"loss": 2.4439,
"step": 841
},
{
"epoch": 0.7360541987652297,
"grad_norm": 0.76171875,
"learning_rate": 8.908755437422792e-06,
"loss": 2.451,
"step": 842
},
{
"epoch": 0.7369283723979676,
"grad_norm": 0.77734375,
"learning_rate": 8.85342379215412e-06,
"loss": 2.4452,
"step": 843
},
{
"epoch": 0.7378025460307054,
"grad_norm": 0.74609375,
"learning_rate": 8.798227516256854e-06,
"loss": 2.4486,
"step": 844
},
{
"epoch": 0.7386767196634432,
"grad_norm": 0.77734375,
"learning_rate": 8.743167072484549e-06,
"loss": 2.4509,
"step": 845
},
{
"epoch": 0.739550893296181,
"grad_norm": 0.77734375,
"learning_rate": 8.688242922451928e-06,
"loss": 2.4726,
"step": 846
},
{
"epoch": 0.7404250669289187,
"grad_norm": 0.78125,
"learning_rate": 8.633455526631098e-06,
"loss": 2.5167,
"step": 847
},
{
"epoch": 0.7412992405616565,
"grad_norm": 0.78515625,
"learning_rate": 8.578805344347623e-06,
"loss": 2.4329,
"step": 848
},
{
"epoch": 0.7421734141943943,
"grad_norm": 0.76171875,
"learning_rate": 8.524292833776706e-06,
"loss": 2.4932,
"step": 849
},
{
"epoch": 0.7430475878271322,
"grad_norm": 0.78125,
"learning_rate": 8.469918451939334e-06,
"loss": 2.4598,
"step": 850
},
{
"epoch": 0.74392176145987,
"grad_norm": 0.78125,
"learning_rate": 8.415682654698459e-06,
"loss": 2.4279,
"step": 851
},
{
"epoch": 0.7447959350926078,
"grad_norm": 0.765625,
"learning_rate": 8.361585896755181e-06,
"loss": 2.459,
"step": 852
},
{
"epoch": 0.7456701087253456,
"grad_norm": 0.77734375,
"learning_rate": 8.307628631644903e-06,
"loss": 2.4687,
"step": 853
},
{
"epoch": 0.7465442823580833,
"grad_norm": 0.77734375,
"learning_rate": 8.253811311733567e-06,
"loss": 2.5331,
"step": 854
},
{
"epoch": 0.7474184559908211,
"grad_norm": 0.79296875,
"learning_rate": 8.200134388213837e-06,
"loss": 2.4478,
"step": 855
},
{
"epoch": 0.748292629623559,
"grad_norm": 0.79296875,
"learning_rate": 8.146598311101317e-06,
"loss": 2.4979,
"step": 856
},
{
"epoch": 0.7491668032562968,
"grad_norm": 0.765625,
"learning_rate": 8.09320352923081e-06,
"loss": 2.4402,
"step": 857
},
{
"epoch": 0.7500409768890346,
"grad_norm": 0.7734375,
"learning_rate": 8.039950490252505e-06,
"loss": 2.4791,
"step": 858
},
{
"epoch": 0.7509151505217724,
"grad_norm": 0.78125,
"learning_rate": 7.986839640628268e-06,
"loss": 2.4827,
"step": 859
},
{
"epoch": 0.7517893241545102,
"grad_norm": 0.7734375,
"learning_rate": 7.93387142562787e-06,
"loss": 2.4664,
"step": 860
},
{
"epoch": 0.752663497787248,
"grad_norm": 0.7734375,
"learning_rate": 7.881046289325268e-06,
"loss": 2.5298,
"step": 861
},
{
"epoch": 0.7535376714199858,
"grad_norm": 0.78515625,
"learning_rate": 7.82836467459487e-06,
"loss": 2.4936,
"step": 862
},
{
"epoch": 0.7544118450527236,
"grad_norm": 0.78125,
"learning_rate": 7.775827023107835e-06,
"loss": 2.4657,
"step": 863
},
{
"epoch": 0.7552860186854614,
"grad_norm": 0.7734375,
"learning_rate": 7.723433775328384e-06,
"loss": 2.4861,
"step": 864
},
{
"epoch": 0.7561601923181992,
"grad_norm": 0.76953125,
"learning_rate": 7.671185370510059e-06,
"loss": 2.4551,
"step": 865
},
{
"epoch": 0.757034365950937,
"grad_norm": 0.78125,
"learning_rate": 7.619082246692103e-06,
"loss": 2.4114,
"step": 866
},
{
"epoch": 0.7579085395836748,
"grad_norm": 0.765625,
"learning_rate": 7.567124840695708e-06,
"loss": 2.3837,
"step": 867
},
{
"epoch": 0.7587827132164127,
"grad_norm": 0.7734375,
"learning_rate": 7.515313588120451e-06,
"loss": 2.4788,
"step": 868
},
{
"epoch": 0.7596568868491504,
"grad_norm": 0.765625,
"learning_rate": 7.463648923340558e-06,
"loss": 2.45,
"step": 869
},
{
"epoch": 0.7605310604818882,
"grad_norm": 0.7734375,
"learning_rate": 7.412131279501297e-06,
"loss": 2.4414,
"step": 870
},
{
"epoch": 0.761405234114626,
"grad_norm": 0.78515625,
"learning_rate": 7.36076108851537e-06,
"loss": 2.4682,
"step": 871
},
{
"epoch": 0.7622794077473638,
"grad_norm": 0.78125,
"learning_rate": 7.309538781059239e-06,
"loss": 2.5377,
"step": 872
},
{
"epoch": 0.7631535813801016,
"grad_norm": 0.7734375,
"learning_rate": 7.258464786569549e-06,
"loss": 2.4896,
"step": 873
},
{
"epoch": 0.7640277550128394,
"grad_norm": 0.80078125,
"learning_rate": 7.207539533239527e-06,
"loss": 2.5085,
"step": 874
},
{
"epoch": 0.7649019286455773,
"grad_norm": 0.78515625,
"learning_rate": 7.156763448015377e-06,
"loss": 2.4876,
"step": 875
},
{
"epoch": 0.765776102278315,
"grad_norm": 0.78515625,
"learning_rate": 7.106136956592729e-06,
"loss": 2.4602,
"step": 876
},
{
"epoch": 0.7666502759110528,
"grad_norm": 0.78515625,
"learning_rate": 7.055660483413029e-06,
"loss": 2.5205,
"step": 877
},
{
"epoch": 0.7675244495437906,
"grad_norm": 0.765625,
"learning_rate": 7.005334451660034e-06,
"loss": 2.4711,
"step": 878
},
{
"epoch": 0.7683986231765284,
"grad_norm": 0.7734375,
"learning_rate": 6.95515928325618e-06,
"loss": 2.4817,
"step": 879
},
{
"epoch": 0.7692727968092662,
"grad_norm": 0.76171875,
"learning_rate": 6.905135398859156e-06,
"loss": 2.4179,
"step": 880
},
{
"epoch": 0.7701469704420041,
"grad_norm": 0.7734375,
"learning_rate": 6.855263217858279e-06,
"loss": 2.4559,
"step": 881
},
{
"epoch": 0.7710211440747419,
"grad_norm": 0.7890625,
"learning_rate": 6.805543158371028e-06,
"loss": 2.4928,
"step": 882
},
{
"epoch": 0.7718953177074797,
"grad_norm": 0.78125,
"learning_rate": 6.7559756372395475e-06,
"loss": 2.4632,
"step": 883
},
{
"epoch": 0.7727694913402174,
"grad_norm": 0.7578125,
"learning_rate": 6.706561070027109e-06,
"loss": 2.4679,
"step": 884
},
{
"epoch": 0.7736436649729552,
"grad_norm": 0.796875,
"learning_rate": 6.657299871014664e-06,
"loss": 2.481,
"step": 885
},
{
"epoch": 0.774517838605693,
"grad_norm": 0.78125,
"learning_rate": 6.60819245319734e-06,
"loss": 2.4626,
"step": 886
},
{
"epoch": 0.7753920122384309,
"grad_norm": 0.77734375,
"learning_rate": 6.5592392282810364e-06,
"loss": 2.5,
"step": 887
},
{
"epoch": 0.7762661858711687,
"grad_norm": 0.77734375,
"learning_rate": 6.5104406066788915e-06,
"loss": 2.4614,
"step": 888
},
{
"epoch": 0.7771403595039065,
"grad_norm": 0.8046875,
"learning_rate": 6.461796997507899e-06,
"loss": 2.5137,
"step": 889
},
{
"epoch": 0.7780145331366443,
"grad_norm": 0.8203125,
"learning_rate": 6.4133088085854775e-06,
"loss": 2.5302,
"step": 890
},
{
"epoch": 0.778888706769382,
"grad_norm": 0.77734375,
"learning_rate": 6.3649764464260105e-06,
"loss": 2.4871,
"step": 891
},
{
"epoch": 0.7797628804021198,
"grad_norm": 0.7734375,
"learning_rate": 6.316800316237481e-06,
"loss": 2.5117,
"step": 892
},
{
"epoch": 0.7806370540348577,
"grad_norm": 0.78125,
"learning_rate": 6.268780821918044e-06,
"loss": 2.4903,
"step": 893
},
{
"epoch": 0.7815112276675955,
"grad_norm": 0.77734375,
"learning_rate": 6.220918366052661e-06,
"loss": 2.4107,
"step": 894
},
{
"epoch": 0.7823854013003333,
"grad_norm": 0.78515625,
"learning_rate": 6.173213349909729e-06,
"loss": 2.44,
"step": 895
},
{
"epoch": 0.7832595749330711,
"grad_norm": 0.80078125,
"learning_rate": 6.125666173437678e-06,
"loss": 2.456,
"step": 896
},
{
"epoch": 0.7841337485658089,
"grad_norm": 0.78125,
"learning_rate": 6.078277235261681e-06,
"loss": 2.4971,
"step": 897
},
{
"epoch": 0.7850079221985466,
"grad_norm": 0.79296875,
"learning_rate": 6.031046932680229e-06,
"loss": 2.4523,
"step": 898
},
{
"epoch": 0.7858820958312845,
"grad_norm": 0.78515625,
"learning_rate": 5.983975661661889e-06,
"loss": 2.4813,
"step": 899
},
{
"epoch": 0.7867562694640223,
"grad_norm": 0.75390625,
"learning_rate": 5.93706381684192e-06,
"loss": 2.4376,
"step": 900
},
{
"epoch": 0.7876304430967601,
"grad_norm": 0.76953125,
"learning_rate": 5.8903117915189875e-06,
"loss": 2.466,
"step": 901
},
{
"epoch": 0.7885046167294979,
"grad_norm": 0.7890625,
"learning_rate": 5.843719977651882e-06,
"loss": 2.47,
"step": 902
},
{
"epoch": 0.7893787903622357,
"grad_norm": 0.78515625,
"learning_rate": 5.7972887658561955e-06,
"loss": 2.4402,
"step": 903
},
{
"epoch": 0.7902529639949735,
"grad_norm": 0.79296875,
"learning_rate": 5.751018545401076e-06,
"loss": 2.4675,
"step": 904
},
{
"epoch": 0.7911271376277113,
"grad_norm": 0.81640625,
"learning_rate": 5.704909704205949e-06,
"loss": 2.4422,
"step": 905
},
{
"epoch": 0.7920013112604491,
"grad_norm": 0.80859375,
"learning_rate": 5.658962628837289e-06,
"loss": 2.4023,
"step": 906
},
{
"epoch": 0.7928754848931869,
"grad_norm": 0.80078125,
"learning_rate": 5.613177704505343e-06,
"loss": 2.4977,
"step": 907
},
{
"epoch": 0.7937496585259247,
"grad_norm": 0.76171875,
"learning_rate": 5.567555315060918e-06,
"loss": 2.4663,
"step": 908
},
{
"epoch": 0.7946238321586625,
"grad_norm": 0.76171875,
"learning_rate": 5.522095842992195e-06,
"loss": 2.4664,
"step": 909
},
{
"epoch": 0.7954980057914003,
"grad_norm": 0.7890625,
"learning_rate": 5.476799669421437e-06,
"loss": 2.4706,
"step": 910
},
{
"epoch": 0.7963721794241381,
"grad_norm": 0.765625,
"learning_rate": 5.431667174101901e-06,
"loss": 2.5069,
"step": 911
},
{
"epoch": 0.797246353056876,
"grad_norm": 0.7734375,
"learning_rate": 5.3866987354145724e-06,
"loss": 2.4853,
"step": 912
},
{
"epoch": 0.7981205266896138,
"grad_norm": 0.7890625,
"learning_rate": 5.3418947303650185e-06,
"loss": 2.4523,
"step": 913
},
{
"epoch": 0.7989947003223515,
"grad_norm": 0.7734375,
"learning_rate": 5.297255534580256e-06,
"loss": 2.4469,
"step": 914
},
{
"epoch": 0.7998688739550893,
"grad_norm": 0.76953125,
"learning_rate": 5.252781522305556e-06,
"loss": 2.4548,
"step": 915
},
{
"epoch": 0.8007430475878271,
"grad_norm": 0.78515625,
"learning_rate": 5.208473066401329e-06,
"loss": 2.4469,
"step": 916
},
{
"epoch": 0.8016172212205649,
"grad_norm": 0.96875,
"learning_rate": 5.164330538339995e-06,
"loss": 2.4885,
"step": 917
},
{
"epoch": 0.8024913948533028,
"grad_norm": 0.76953125,
"learning_rate": 5.120354308202893e-06,
"loss": 2.4482,
"step": 918
},
{
"epoch": 0.8033655684860406,
"grad_norm": 0.79296875,
"learning_rate": 5.076544744677128e-06,
"loss": 2.5142,
"step": 919
},
{
"epoch": 0.8042397421187784,
"grad_norm": 0.7734375,
"learning_rate": 5.032902215052515e-06,
"loss": 2.524,
"step": 920
},
{
"epoch": 0.8051139157515161,
"grad_norm": 0.7734375,
"learning_rate": 4.989427085218523e-06,
"loss": 2.4633,
"step": 921
},
{
"epoch": 0.8059880893842539,
"grad_norm": 0.76171875,
"learning_rate": 4.946119719661121e-06,
"loss": 2.4766,
"step": 922
},
{
"epoch": 0.8068622630169917,
"grad_norm": 0.76953125,
"learning_rate": 4.902980481459835e-06,
"loss": 2.5139,
"step": 923
},
{
"epoch": 0.8077364366497296,
"grad_norm": 0.75390625,
"learning_rate": 4.860009732284609e-06,
"loss": 2.4214,
"step": 924
},
{
"epoch": 0.8086106102824674,
"grad_norm": 0.78125,
"learning_rate": 4.817207832392842e-06,
"loss": 2.4861,
"step": 925
},
{
"epoch": 0.8094847839152052,
"grad_norm": 0.75390625,
"learning_rate": 4.7745751406263165e-06,
"loss": 2.4826,
"step": 926
},
{
"epoch": 0.810358957547943,
"grad_norm": 0.7578125,
"learning_rate": 4.732112014408213e-06,
"loss": 2.4694,
"step": 927
},
{
"epoch": 0.8112331311806807,
"grad_norm": 0.78515625,
"learning_rate": 4.689818809740118e-06,
"loss": 2.4753,
"step": 928
},
{
"epoch": 0.8121073048134185,
"grad_norm": 0.76171875,
"learning_rate": 4.647695881199024e-06,
"loss": 2.4846,
"step": 929
},
{
"epoch": 0.8129814784461563,
"grad_norm": 0.80078125,
"learning_rate": 4.605743581934385e-06,
"loss": 2.5179,
"step": 930
},
{
"epoch": 0.8138556520788942,
"grad_norm": 0.76953125,
"learning_rate": 4.563962263665114e-06,
"loss": 2.4131,
"step": 931
},
{
"epoch": 0.814729825711632,
"grad_norm": 0.765625,
"learning_rate": 4.522352276676661e-06,
"loss": 2.4216,
"step": 932
},
{
"epoch": 0.8156039993443698,
"grad_norm": 0.74609375,
"learning_rate": 4.480913969818098e-06,
"loss": 2.3917,
"step": 933
},
{
"epoch": 0.8164781729771076,
"grad_norm": 0.76953125,
"learning_rate": 4.439647690499122e-06,
"loss": 2.4532,
"step": 934
},
{
"epoch": 0.8173523466098453,
"grad_norm": 0.8046875,
"learning_rate": 4.398553784687226e-06,
"loss": 2.5099,
"step": 935
},
{
"epoch": 0.8182265202425831,
"grad_norm": 0.76953125,
"learning_rate": 4.357632596904743e-06,
"loss": 2.4413,
"step": 936
},
{
"epoch": 0.819100693875321,
"grad_norm": 0.78515625,
"learning_rate": 4.31688447022599e-06,
"loss": 2.4713,
"step": 937
},
{
"epoch": 0.8199748675080588,
"grad_norm": 0.78125,
"learning_rate": 4.276309746274368e-06,
"loss": 2.4015,
"step": 938
},
{
"epoch": 0.8208490411407966,
"grad_norm": 0.82421875,
"learning_rate": 4.235908765219504e-06,
"loss": 2.4893,
"step": 939
},
{
"epoch": 0.8217232147735344,
"grad_norm": 0.7734375,
"learning_rate": 4.195681865774406e-06,
"loss": 2.4033,
"step": 940
},
{
"epoch": 0.8225973884062722,
"grad_norm": 0.765625,
"learning_rate": 4.155629385192619e-06,
"loss": 2.4699,
"step": 941
},
{
"epoch": 0.82347156203901,
"grad_norm": 0.76953125,
"learning_rate": 4.115751659265407e-06,
"loss": 2.4399,
"step": 942
},
{
"epoch": 0.8243457356717478,
"grad_norm": 0.79296875,
"learning_rate": 4.0760490223189144e-06,
"loss": 2.4812,
"step": 943
},
{
"epoch": 0.8252199093044856,
"grad_norm": 0.8671875,
"learning_rate": 4.036521807211393e-06,
"loss": 2.4413,
"step": 944
},
{
"epoch": 0.8260940829372234,
"grad_norm": 0.7734375,
"learning_rate": 3.997170345330387e-06,
"loss": 2.4954,
"step": 945
},
{
"epoch": 0.8269682565699612,
"grad_norm": 0.78125,
"learning_rate": 3.957994966589965e-06,
"loss": 2.4214,
"step": 946
},
{
"epoch": 0.827842430202699,
"grad_norm": 0.78125,
"learning_rate": 3.918995999427949e-06,
"loss": 2.4462,
"step": 947
},
{
"epoch": 0.8287166038354368,
"grad_norm": 0.74609375,
"learning_rate": 3.880173770803169e-06,
"loss": 2.3887,
"step": 948
},
{
"epoch": 0.8295907774681747,
"grad_norm": 0.7578125,
"learning_rate": 3.8415286061927265e-06,
"loss": 2.4514,
"step": 949
},
{
"epoch": 0.8304649511009125,
"grad_norm": 0.7734375,
"learning_rate": 3.8030608295892416e-06,
"loss": 2.4721,
"step": 950
},
{
"epoch": 0.8313391247336502,
"grad_norm": 0.765625,
"learning_rate": 3.764770763498163e-06,
"loss": 2.5131,
"step": 951
},
{
"epoch": 0.832213298366388,
"grad_norm": 0.765625,
"learning_rate": 3.726658728935048e-06,
"loss": 2.4598,
"step": 952
},
{
"epoch": 0.8330874719991258,
"grad_norm": 0.76953125,
"learning_rate": 3.688725045422867e-06,
"loss": 2.4106,
"step": 953
},
{
"epoch": 0.8339616456318636,
"grad_norm": 0.76953125,
"learning_rate": 3.6509700309893618e-06,
"loss": 2.4475,
"step": 954
},
{
"epoch": 0.8348358192646015,
"grad_norm": 0.7890625,
"learning_rate": 3.613394002164322e-06,
"loss": 2.4791,
"step": 955
},
{
"epoch": 0.8357099928973393,
"grad_norm": 0.7578125,
"learning_rate": 3.575997273976983e-06,
"loss": 2.4782,
"step": 956
},
{
"epoch": 0.8365841665300771,
"grad_norm": 0.77734375,
"learning_rate": 3.5387801599533475e-06,
"loss": 2.4587,
"step": 957
},
{
"epoch": 0.8374583401628148,
"grad_norm": 0.7890625,
"learning_rate": 3.5017429721135807e-06,
"loss": 2.4663,
"step": 958
},
{
"epoch": 0.8383325137955526,
"grad_norm": 0.7890625,
"learning_rate": 3.4648860209693794e-06,
"loss": 2.4975,
"step": 959
},
{
"epoch": 0.8392066874282904,
"grad_norm": 0.76171875,
"learning_rate": 3.428209615521377e-06,
"loss": 2.3964,
"step": 960
},
{
"epoch": 0.8400808610610282,
"grad_norm": 0.7578125,
"learning_rate": 3.3917140632565624e-06,
"loss": 2.4558,
"step": 961
},
{
"epoch": 0.8409550346937661,
"grad_norm": 0.7734375,
"learning_rate": 3.35539967014567e-06,
"loss": 2.5099,
"step": 962
},
{
"epoch": 0.8418292083265039,
"grad_norm": 0.765625,
"learning_rate": 3.319266740640661e-06,
"loss": 2.4713,
"step": 963
},
{
"epoch": 0.8427033819592417,
"grad_norm": 0.7734375,
"learning_rate": 3.283315577672122e-06,
"loss": 2.4063,
"step": 964
},
{
"epoch": 0.8435775555919794,
"grad_norm": 0.76953125,
"learning_rate": 3.2475464826467627e-06,
"loss": 2.4831,
"step": 965
},
{
"epoch": 0.8444517292247172,
"grad_norm": 0.78515625,
"learning_rate": 3.2119597554448657e-06,
"loss": 2.4549,
"step": 966
},
{
"epoch": 0.845325902857455,
"grad_norm": 0.7734375,
"learning_rate": 3.1765556944177823e-06,
"loss": 2.4457,
"step": 967
},
{
"epoch": 0.8462000764901929,
"grad_norm": 0.7734375,
"learning_rate": 3.141334596385448e-06,
"loss": 2.4424,
"step": 968
},
{
"epoch": 0.8470742501229307,
"grad_norm": 0.75390625,
"learning_rate": 3.106296756633853e-06,
"loss": 2.4523,
"step": 969
},
{
"epoch": 0.8479484237556685,
"grad_norm": 0.7734375,
"learning_rate": 3.0714424689126024e-06,
"loss": 2.4693,
"step": 970
},
{
"epoch": 0.8488225973884063,
"grad_norm": 0.76171875,
"learning_rate": 3.0367720254324357e-06,
"loss": 2.4814,
"step": 971
},
{
"epoch": 0.849696771021144,
"grad_norm": 0.765625,
"learning_rate": 3.002285716862785e-06,
"loss": 2.4373,
"step": 972
},
{
"epoch": 0.8505709446538818,
"grad_norm": 0.78125,
"learning_rate": 2.967983832329341e-06,
"loss": 2.4637,
"step": 973
},
{
"epoch": 0.8514451182866197,
"grad_norm": 0.77734375,
"learning_rate": 2.9338666594116134e-06,
"loss": 2.4313,
"step": 974
},
{
"epoch": 0.8523192919193575,
"grad_norm": 0.75390625,
"learning_rate": 2.8999344841405373e-06,
"loss": 2.4572,
"step": 975
},
{
"epoch": 0.8531934655520953,
"grad_norm": 0.7734375,
"learning_rate": 2.8661875909960695e-06,
"loss": 2.4709,
"step": 976
},
{
"epoch": 0.8540676391848331,
"grad_norm": 0.7734375,
"learning_rate": 2.8326262629047917e-06,
"loss": 2.4401,
"step": 977
},
{
"epoch": 0.8549418128175709,
"grad_norm": 0.77734375,
"learning_rate": 2.7992507812375556e-06,
"loss": 2.46,
"step": 978
},
{
"epoch": 0.8558159864503087,
"grad_norm": 0.765625,
"learning_rate": 2.766061425807112e-06,
"loss": 2.451,
"step": 979
},
{
"epoch": 0.8566901600830465,
"grad_norm": 0.77734375,
"learning_rate": 2.733058474865785e-06,
"loss": 2.4501,
"step": 980
},
{
"epoch": 0.8575643337157843,
"grad_norm": 0.76171875,
"learning_rate": 2.700242205103104e-06,
"loss": 2.4789,
"step": 981
},
{
"epoch": 0.8584385073485221,
"grad_norm": 0.7734375,
"learning_rate": 2.6676128916435256e-06,
"loss": 2.4685,
"step": 982
},
{
"epoch": 0.8593126809812599,
"grad_norm": 0.7578125,
"learning_rate": 2.635170808044077e-06,
"loss": 2.5047,
"step": 983
},
{
"epoch": 0.8601868546139977,
"grad_norm": 0.75,
"learning_rate": 2.602916226292121e-06,
"loss": 2.4509,
"step": 984
},
{
"epoch": 0.8610610282467355,
"grad_norm": 0.76171875,
"learning_rate": 2.5708494168030255e-06,
"loss": 2.4669,
"step": 985
},
{
"epoch": 0.8619352018794733,
"grad_norm": 0.78125,
"learning_rate": 2.538970648417921e-06,
"loss": 2.4744,
"step": 986
},
{
"epoch": 0.8628093755122112,
"grad_norm": 0.75390625,
"learning_rate": 2.507280188401456e-06,
"loss": 2.4848,
"step": 987
},
{
"epoch": 0.8636835491449489,
"grad_norm": 0.77734375,
"learning_rate": 2.475778302439524e-06,
"loss": 2.5041,
"step": 988
},
{
"epoch": 0.8645577227776867,
"grad_norm": 0.796875,
"learning_rate": 2.444465254637063e-06,
"loss": 2.434,
"step": 989
},
{
"epoch": 0.8654318964104245,
"grad_norm": 0.76953125,
"learning_rate": 2.4133413075158344e-06,
"loss": 2.4779,
"step": 990
},
{
"epoch": 0.8663060700431623,
"grad_norm": 0.76171875,
"learning_rate": 2.382406722012212e-06,
"loss": 2.4035,
"step": 991
},
{
"epoch": 0.8671802436759001,
"grad_norm": 0.76171875,
"learning_rate": 2.351661757475021e-06,
"loss": 2.428,
"step": 992
},
{
"epoch": 0.868054417308638,
"grad_norm": 0.76171875,
"learning_rate": 2.3211066716633257e-06,
"loss": 2.4896,
"step": 993
},
{
"epoch": 0.8689285909413758,
"grad_norm": 0.76953125,
"learning_rate": 2.2907417207443133e-06,
"loss": 2.4964,
"step": 994
},
{
"epoch": 0.8698027645741135,
"grad_norm": 0.76171875,
"learning_rate": 2.2605671592910824e-06,
"loss": 2.36,
"step": 995
},
{
"epoch": 0.8706769382068513,
"grad_norm": 0.75390625,
"learning_rate": 2.23058324028059e-06,
"loss": 2.4351,
"step": 996
},
{
"epoch": 0.8715511118395891,
"grad_norm": 0.78515625,
"learning_rate": 2.200790215091464e-06,
"loss": 2.51,
"step": 997
},
{
"epoch": 0.8724252854723269,
"grad_norm": 0.765625,
"learning_rate": 2.1711883335019225e-06,
"loss": 2.4247,
"step": 998
},
{
"epoch": 0.8732994591050648,
"grad_norm": 0.77734375,
"learning_rate": 2.1417778436876867e-06,
"loss": 2.4565,
"step": 999
},
{
"epoch": 0.8741736327378026,
"grad_norm": 0.79296875,
"learning_rate": 2.1125589922198845e-06,
"loss": 2.4257,
"step": 1000
},
{
"epoch": 0.8750478063705404,
"grad_norm": 0.78125,
"learning_rate": 2.0835320240629898e-06,
"loss": 2.4488,
"step": 1001
},
{
"epoch": 0.8759219800032781,
"grad_norm": 0.75390625,
"learning_rate": 2.054697182572765e-06,
"loss": 2.4643,
"step": 1002
},
{
"epoch": 0.8767961536360159,
"grad_norm": 0.77734375,
"learning_rate": 2.026054709494235e-06,
"loss": 2.4566,
"step": 1003
},
{
"epoch": 0.8776703272687537,
"grad_norm": 0.76953125,
"learning_rate": 1.9976048449596413e-06,
"loss": 2.4389,
"step": 1004
},
{
"epoch": 0.8785445009014916,
"grad_norm": 0.78125,
"learning_rate": 1.969347827486437e-06,
"loss": 2.4926,
"step": 1005
},
{
"epoch": 0.8794186745342294,
"grad_norm": 0.76171875,
"learning_rate": 1.9412838939753015e-06,
"loss": 2.4809,
"step": 1006
},
{
"epoch": 0.8802928481669672,
"grad_norm": 0.75,
"learning_rate": 1.9134132797081096e-06,
"loss": 2.4691,
"step": 1007
},
{
"epoch": 0.881167021799705,
"grad_norm": 0.7734375,
"learning_rate": 1.8857362183460264e-06,
"loss": 2.4868,
"step": 1008
},
{
"epoch": 0.8820411954324427,
"grad_norm": 0.76171875,
"learning_rate": 1.8582529419274908e-06,
"loss": 2.4191,
"step": 1009
},
{
"epoch": 0.8829153690651805,
"grad_norm": 0.77734375,
"learning_rate": 1.830963680866285e-06,
"loss": 2.4697,
"step": 1010
},
{
"epoch": 0.8837895426979184,
"grad_norm": 0.77734375,
"learning_rate": 1.8038686639496344e-06,
"loss": 2.4417,
"step": 1011
},
{
"epoch": 0.8846637163306562,
"grad_norm": 0.78125,
"learning_rate": 1.7769681183362391e-06,
"loss": 2.4567,
"step": 1012
},
{
"epoch": 0.885537889963394,
"grad_norm": 0.77734375,
"learning_rate": 1.7502622695544036e-06,
"loss": 2.491,
"step": 1013
},
{
"epoch": 0.8864120635961318,
"grad_norm": 0.76953125,
"learning_rate": 1.723751341500135e-06,
"loss": 2.4712,
"step": 1014
},
{
"epoch": 0.8872862372288696,
"grad_norm": 0.76953125,
"learning_rate": 1.697435556435273e-06,
"loss": 2.4346,
"step": 1015
},
{
"epoch": 0.8881604108616074,
"grad_norm": 0.78125,
"learning_rate": 1.6713151349856182e-06,
"loss": 2.4909,
"step": 1016
},
{
"epoch": 0.8890345844943451,
"grad_norm": 0.7578125,
"learning_rate": 1.6453902961390738e-06,
"loss": 2.4376,
"step": 1017
},
{
"epoch": 0.889908758127083,
"grad_norm": 0.765625,
"learning_rate": 1.6196612572438429e-06,
"loss": 2.4464,
"step": 1018
},
{
"epoch": 0.8907829317598208,
"grad_norm": 0.78515625,
"learning_rate": 1.59412823400657e-06,
"loss": 2.458,
"step": 1019
},
{
"epoch": 0.8916571053925586,
"grad_norm": 0.76171875,
"learning_rate": 1.5687914404905496e-06,
"loss": 2.4608,
"step": 1020
},
{
"epoch": 0.8925312790252964,
"grad_norm": 0.77734375,
"learning_rate": 1.5436510891139232e-06,
"loss": 2.424,
"step": 1021
},
{
"epoch": 0.8934054526580342,
"grad_norm": 0.81640625,
"learning_rate": 1.5187073906479193e-06,
"loss": 2.3873,
"step": 1022
},
{
"epoch": 0.894279626290772,
"grad_norm": 0.7734375,
"learning_rate": 1.4939605542150598e-06,
"loss": 2.4393,
"step": 1023
},
{
"epoch": 0.8951537999235099,
"grad_norm": 0.78125,
"learning_rate": 1.4694107872874175e-06,
"loss": 2.4266,
"step": 1024
},
{
"epoch": 0.8960279735562476,
"grad_norm": 0.76171875,
"learning_rate": 1.4450582956848924e-06,
"loss": 2.3892,
"step": 1025
},
{
"epoch": 0.8969021471889854,
"grad_norm": 0.76953125,
"learning_rate": 1.4209032835734438e-06,
"loss": 2.5031,
"step": 1026
},
{
"epoch": 0.8977763208217232,
"grad_norm": 0.75390625,
"learning_rate": 1.3969459534634328e-06,
"loss": 2.4142,
"step": 1027
},
{
"epoch": 0.898650494454461,
"grad_norm": 0.78125,
"learning_rate": 1.3731865062078852e-06,
"loss": 2.4603,
"step": 1028
},
{
"epoch": 0.8995246680871988,
"grad_norm": 0.8046875,
"learning_rate": 1.3496251410008182e-06,
"loss": 2.4121,
"step": 1029
},
{
"epoch": 0.9003988417199367,
"grad_norm": 0.76953125,
"learning_rate": 1.3262620553755795e-06,
"loss": 2.5045,
"step": 1030
},
{
"epoch": 0.9012730153526745,
"grad_norm": 0.78515625,
"learning_rate": 1.303097445203183e-06,
"loss": 2.5258,
"step": 1031
},
{
"epoch": 0.9021471889854122,
"grad_norm": 0.7578125,
"learning_rate": 1.2801315046906626e-06,
"loss": 2.4065,
"step": 1032
},
{
"epoch": 0.90302136261815,
"grad_norm": 0.7578125,
"learning_rate": 1.2573644263794483e-06,
"loss": 2.4467,
"step": 1033
},
{
"epoch": 0.9038955362508878,
"grad_norm": 0.76953125,
"learning_rate": 1.2347964011437618e-06,
"loss": 2.4969,
"step": 1034
},
{
"epoch": 0.9047697098836256,
"grad_norm": 0.74609375,
"learning_rate": 1.2124276181890021e-06,
"loss": 2.4511,
"step": 1035
},
{
"epoch": 0.9056438835163635,
"grad_norm": 0.79296875,
"learning_rate": 1.1902582650501591e-06,
"loss": 2.4106,
"step": 1036
},
{
"epoch": 0.9065180571491013,
"grad_norm": 0.765625,
"learning_rate": 1.1682885275902611e-06,
"loss": 2.498,
"step": 1037
},
{
"epoch": 0.9073922307818391,
"grad_norm": 0.75390625,
"learning_rate": 1.1465185899987797e-06,
"loss": 2.4392,
"step": 1038
},
{
"epoch": 0.9082664044145768,
"grad_norm": 0.7734375,
"learning_rate": 1.1249486347901295e-06,
"loss": 2.4253,
"step": 1039
},
{
"epoch": 0.9091405780473146,
"grad_norm": 0.75390625,
"learning_rate": 1.1035788428021082e-06,
"loss": 2.3555,
"step": 1040
},
{
"epoch": 0.9100147516800524,
"grad_norm": 0.75390625,
"learning_rate": 1.0824093931943836e-06,
"loss": 2.4441,
"step": 1041
},
{
"epoch": 0.9108889253127902,
"grad_norm": 0.73828125,
"learning_rate": 1.0614404634470081e-06,
"loss": 2.3977,
"step": 1042
},
{
"epoch": 0.9117630989455281,
"grad_norm": 0.77734375,
"learning_rate": 1.0406722293589078e-06,
"loss": 2.4909,
"step": 1043
},
{
"epoch": 0.9126372725782659,
"grad_norm": 0.77734375,
"learning_rate": 1.0201048650464256e-06,
"loss": 2.4554,
"step": 1044
},
{
"epoch": 0.9135114462110037,
"grad_norm": 1.6796875,
"learning_rate": 9.997385429418555e-07,
"loss": 2.4419,
"step": 1045
},
{
"epoch": 0.9143856198437414,
"grad_norm": 0.76953125,
"learning_rate": 9.795734337920027e-07,
"loss": 2.4223,
"step": 1046
},
{
"epoch": 0.9152597934764792,
"grad_norm": 0.78515625,
"learning_rate": 9.596097066567389e-07,
"loss": 2.4728,
"step": 1047
},
{
"epoch": 0.916133967109217,
"grad_norm": 0.80078125,
"learning_rate": 9.398475289075892e-07,
"loss": 2.4943,
"step": 1048
},
{
"epoch": 0.9170081407419549,
"grad_norm": 0.77734375,
"learning_rate": 9.202870662263551e-07,
"loss": 2.4418,
"step": 1049
},
{
"epoch": 0.9178823143746927,
"grad_norm": 0.75390625,
"learning_rate": 9.009284826036691e-07,
"loss": 2.4666,
"step": 1050
},
{
"epoch": 0.9187564880074305,
"grad_norm": 0.78125,
"learning_rate": 8.817719403376834e-07,
"loss": 2.4634,
"step": 1051
},
{
"epoch": 0.9196306616401683,
"grad_norm": 0.7734375,
"learning_rate": 8.628176000326615e-07,
"loss": 2.4589,
"step": 1052
},
{
"epoch": 0.920504835272906,
"grad_norm": 0.74609375,
"learning_rate": 8.440656205976643e-07,
"loss": 2.3923,
"step": 1053
},
{
"epoch": 0.9213790089056438,
"grad_norm": 0.765625,
"learning_rate": 8.255161592451905e-07,
"loss": 2.4739,
"step": 1054
},
{
"epoch": 0.9222531825383817,
"grad_norm": 0.7734375,
"learning_rate": 8.071693714898742e-07,
"loss": 2.5137,
"step": 1055
},
{
"epoch": 0.9231273561711195,
"grad_norm": 0.79296875,
"learning_rate": 7.890254111471845e-07,
"loss": 2.5085,
"step": 1056
},
{
"epoch": 0.9240015298038573,
"grad_norm": 0.765625,
"learning_rate": 7.710844303321197e-07,
"loss": 2.4841,
"step": 1057
},
{
"epoch": 0.9248757034365951,
"grad_norm": 0.78515625,
"learning_rate": 7.533465794579558e-07,
"loss": 2.4786,
"step": 1058
},
{
"epoch": 0.9257498770693329,
"grad_norm": 0.765625,
"learning_rate": 7.358120072349595e-07,
"loss": 2.474,
"step": 1059
},
{
"epoch": 0.9266240507020707,
"grad_norm": 0.77734375,
"learning_rate": 7.184808606691546e-07,
"loss": 2.4591,
"step": 1060
},
{
"epoch": 0.9274982243348086,
"grad_norm": 0.78515625,
"learning_rate": 7.013532850611016e-07,
"loss": 2.4539,
"step": 1061
},
{
"epoch": 0.9283723979675463,
"grad_norm": 0.7890625,
"learning_rate": 6.844294240046456e-07,
"loss": 2.4213,
"step": 1062
},
{
"epoch": 0.9292465716002841,
"grad_norm": 0.8203125,
"learning_rate": 6.677094193857508e-07,
"loss": 2.4578,
"step": 1063
},
{
"epoch": 0.9301207452330219,
"grad_norm": 0.76171875,
"learning_rate": 6.511934113812845e-07,
"loss": 2.4344,
"step": 1064
},
{
"epoch": 0.9309949188657597,
"grad_norm": 0.78515625,
"learning_rate": 6.348815384578571e-07,
"loss": 2.5178,
"step": 1065
},
{
"epoch": 0.9318690924984975,
"grad_norm": 0.765625,
"learning_rate": 6.187739373706508e-07,
"loss": 2.4604,
"step": 1066
},
{
"epoch": 0.9327432661312354,
"grad_norm": 0.78125,
"learning_rate": 6.02870743162276e-07,
"loss": 2.4583,
"step": 1067
},
{
"epoch": 0.9336174397639732,
"grad_norm": 0.76953125,
"learning_rate": 5.871720891616444e-07,
"loss": 2.4831,
"step": 1068
},
{
"epoch": 0.9344916133967109,
"grad_norm": 0.78125,
"learning_rate": 5.716781069828397e-07,
"loss": 2.452,
"step": 1069
},
{
"epoch": 0.9353657870294487,
"grad_norm": 0.7734375,
"learning_rate": 5.563889265240374e-07,
"loss": 2.4267,
"step": 1070
},
{
"epoch": 0.9362399606621865,
"grad_norm": 0.78125,
"learning_rate": 5.413046759663837e-07,
"loss": 2.4562,
"step": 1071
},
{
"epoch": 0.9371141342949243,
"grad_norm": 0.7734375,
"learning_rate": 5.264254817729436e-07,
"loss": 2.4828,
"step": 1072
},
{
"epoch": 0.9379883079276621,
"grad_norm": 0.7734375,
"learning_rate": 5.117514686876379e-07,
"loss": 2.4634,
"step": 1073
},
{
"epoch": 0.9388624815604,
"grad_norm": 0.75390625,
"learning_rate": 4.972827597341884e-07,
"loss": 2.4548,
"step": 1074
},
{
"epoch": 0.9397366551931378,
"grad_norm": 0.77734375,
"learning_rate": 4.830194762150908e-07,
"loss": 2.499,
"step": 1075
},
{
"epoch": 0.9406108288258755,
"grad_norm": 0.76171875,
"learning_rate": 4.68961737710602e-07,
"loss": 2.4884,
"step": 1076
},
{
"epoch": 0.9414850024586133,
"grad_norm": 0.796875,
"learning_rate": 4.5510966207773787e-07,
"loss": 2.4785,
"step": 1077
},
{
"epoch": 0.9423591760913511,
"grad_norm": 0.7734375,
"learning_rate": 4.4146336544927667e-07,
"loss": 2.5031,
"step": 1078
},
{
"epoch": 0.9432333497240889,
"grad_norm": 0.7890625,
"learning_rate": 4.280229622327908e-07,
"loss": 2.4423,
"step": 1079
},
{
"epoch": 0.9441075233568268,
"grad_norm": 0.765625,
"learning_rate": 4.147885651096861e-07,
"loss": 2.4773,
"step": 1080
},
{
"epoch": 0.9449816969895646,
"grad_norm": 0.77734375,
"learning_rate": 4.0176028503425835e-07,
"loss": 2.4741,
"step": 1081
},
{
"epoch": 0.9458558706223024,
"grad_norm": 0.765625,
"learning_rate": 3.889382312327716e-07,
"loss": 2.4593,
"step": 1082
},
{
"epoch": 0.9467300442550401,
"grad_norm": 0.7734375,
"learning_rate": 3.7632251120252036e-07,
"loss": 2.4922,
"step": 1083
},
{
"epoch": 0.9476042178877779,
"grad_norm": 0.74609375,
"learning_rate": 3.639132307109522e-07,
"loss": 2.3861,
"step": 1084
},
{
"epoch": 0.9484783915205157,
"grad_norm": 0.78515625,
"learning_rate": 3.517104937947657e-07,
"loss": 2.491,
"step": 1085
},
{
"epoch": 0.9493525651532536,
"grad_norm": 0.76953125,
"learning_rate": 3.3971440275904486e-07,
"loss": 2.4887,
"step": 1086
},
{
"epoch": 0.9502267387859914,
"grad_norm": 0.77734375,
"learning_rate": 3.279250581763982e-07,
"loss": 2.3822,
"step": 1087
},
{
"epoch": 0.9511009124187292,
"grad_norm": 0.78515625,
"learning_rate": 3.163425588861152e-07,
"loss": 2.4738,
"step": 1088
},
{
"epoch": 0.951975086051467,
"grad_norm": 0.765625,
"learning_rate": 3.049670019933476e-07,
"loss": 2.4591,
"step": 1089
},
{
"epoch": 0.9528492596842048,
"grad_norm": 0.7578125,
"learning_rate": 2.937984828682766e-07,
"loss": 2.4393,
"step": 1090
},
{
"epoch": 0.9537234333169425,
"grad_norm": 0.796875,
"learning_rate": 2.828370951453357e-07,
"loss": 2.4883,
"step": 1091
},
{
"epoch": 0.9545976069496804,
"grad_norm": 0.77734375,
"learning_rate": 2.72082930722406e-07,
"loss": 2.4724,
"step": 1092
},
{
"epoch": 0.9554717805824182,
"grad_norm": 0.80078125,
"learning_rate": 2.6153607976005247e-07,
"loss": 2.4764,
"step": 1093
},
{
"epoch": 0.956345954215156,
"grad_norm": 0.765625,
"learning_rate": 2.5119663068077227e-07,
"loss": 2.4236,
"step": 1094
},
{
"epoch": 0.9572201278478938,
"grad_norm": 0.765625,
"learning_rate": 2.410646701682506e-07,
"loss": 2.4893,
"step": 1095
},
{
"epoch": 0.9580943014806316,
"grad_norm": 0.7734375,
"learning_rate": 2.3114028316663373e-07,
"loss": 2.4356,
"step": 1096
},
{
"epoch": 0.9589684751133694,
"grad_norm": 0.77734375,
"learning_rate": 2.2142355287981807e-07,
"loss": 2.4625,
"step": 1097
},
{
"epoch": 0.9598426487461071,
"grad_norm": 0.7734375,
"learning_rate": 2.1191456077075122e-07,
"loss": 2.5392,
"step": 1098
},
{
"epoch": 0.960716822378845,
"grad_norm": 0.76171875,
"learning_rate": 2.0261338656074048e-07,
"loss": 2.4286,
"step": 1099
},
{
"epoch": 0.9615909960115828,
"grad_norm": 0.77734375,
"learning_rate": 1.9352010822880906e-07,
"loss": 2.4218,
"step": 1100
},
{
"epoch": 0.9624651696443206,
"grad_norm": 0.765625,
"learning_rate": 1.8463480201101334e-07,
"loss": 2.4867,
"step": 1101
},
{
"epoch": 0.9633393432770584,
"grad_norm": 0.796875,
"learning_rate": 1.759575423998211e-07,
"loss": 2.4197,
"step": 1102
},
{
"epoch": 0.9642135169097962,
"grad_norm": 0.7578125,
"learning_rate": 1.674884021434897e-07,
"loss": 2.4727,
"step": 1103
},
{
"epoch": 0.965087690542534,
"grad_norm": 0.76953125,
"learning_rate": 1.5922745224543624e-07,
"loss": 2.4637,
"step": 1104
},
{
"epoch": 0.9659618641752719,
"grad_norm": 0.78125,
"learning_rate": 1.5117476196366553e-07,
"loss": 2.4314,
"step": 1105
},
{
"epoch": 0.9668360378080096,
"grad_norm": 0.7734375,
"learning_rate": 1.4333039881017364e-07,
"loss": 2.4757,
"step": 1106
},
{
"epoch": 0.9677102114407474,
"grad_norm": 0.76953125,
"learning_rate": 1.3569442855038694e-07,
"loss": 2.4854,
"step": 1107
},
{
"epoch": 0.9685843850734852,
"grad_norm": 0.7734375,
"learning_rate": 1.2826691520262114e-07,
"loss": 2.4181,
"step": 1108
},
{
"epoch": 0.969458558706223,
"grad_norm": 0.78125,
"learning_rate": 1.2104792103752315e-07,
"loss": 2.4979,
"step": 1109
},
{
"epoch": 0.9703327323389608,
"grad_norm": 0.76953125,
"learning_rate": 1.140375065775634e-07,
"loss": 2.4244,
"step": 1110
},
{
"epoch": 0.9712069059716987,
"grad_norm": 0.79296875,
"learning_rate": 1.072357305965277e-07,
"loss": 2.4563,
"step": 1111
},
{
"epoch": 0.9720810796044365,
"grad_norm": 0.78125,
"learning_rate": 1.006426501190233e-07,
"loss": 2.4408,
"step": 1112
},
{
"epoch": 0.9729552532371742,
"grad_norm": 0.76171875,
"learning_rate": 9.425832041999871e-08,
"loss": 2.4476,
"step": 1113
},
{
"epoch": 0.973829426869912,
"grad_norm": 0.7578125,
"learning_rate": 8.808279502427741e-08,
"loss": 2.4144,
"step": 1114
},
{
"epoch": 0.9747036005026498,
"grad_norm": 0.7734375,
"learning_rate": 8.211612570611926e-08,
"loss": 2.4606,
"step": 1115
},
{
"epoch": 0.9755777741353876,
"grad_norm": 0.77734375,
"learning_rate": 7.635836248877648e-08,
"loss": 2.4508,
"step": 1116
},
{
"epoch": 0.9764519477681255,
"grad_norm": 0.78125,
"learning_rate": 7.080955364408004e-08,
"loss": 2.4857,
"step": 1117
},
{
"epoch": 0.9773261214008633,
"grad_norm": 0.765625,
"learning_rate": 6.546974569203446e-08,
"loss": 2.4381,
"step": 1118
},
{
"epoch": 0.9782002950336011,
"grad_norm": 0.78125,
"learning_rate": 6.03389834004181e-08,
"loss": 2.5202,
"step": 1119
},
{
"epoch": 0.9790744686663388,
"grad_norm": 0.7734375,
"learning_rate": 5.5417309784430716e-08,
"loss": 2.4559,
"step": 1120
},
{
"epoch": 0.9799486422990766,
"grad_norm": 0.7734375,
"learning_rate": 5.070476610630204e-08,
"loss": 2.5059,
"step": 1121
},
{
"epoch": 0.9808228159318144,
"grad_norm": 0.74609375,
"learning_rate": 4.6201391874978186e-08,
"loss": 2.3771,
"step": 1122
},
{
"epoch": 0.9816969895645523,
"grad_norm": 0.765625,
"learning_rate": 4.190722484575804e-08,
"loss": 2.4349,
"step": 1123
},
{
"epoch": 0.9825711631972901,
"grad_norm": 0.7890625,
"learning_rate": 3.7822301020004615e-08,
"loss": 2.449,
"step": 1124
},
{
"epoch": 0.9834453368300279,
"grad_norm": 0.7578125,
"learning_rate": 3.39466546448286e-08,
"loss": 2.4703,
"step": 1125
},
{
"epoch": 0.9843195104627657,
"grad_norm": 0.7578125,
"learning_rate": 3.028031821280253e-08,
"loss": 2.4228,
"step": 1126
},
{
"epoch": 0.9851936840955035,
"grad_norm": 0.76953125,
"learning_rate": 2.6823322461688726e-08,
"loss": 2.4272,
"step": 1127
},
{
"epoch": 0.9860678577282412,
"grad_norm": 0.76171875,
"learning_rate": 2.3575696374189548e-08,
"loss": 2.4173,
"step": 1128
},
{
"epoch": 0.986942031360979,
"grad_norm": 0.7734375,
"learning_rate": 2.0537467177692005e-08,
"loss": 2.4448,
"step": 1129
},
{
"epoch": 0.9878162049937169,
"grad_norm": 0.77734375,
"learning_rate": 1.770866034404295e-08,
"loss": 2.5107,
"step": 1130
},
{
"epoch": 0.9886903786264547,
"grad_norm": 0.78125,
"learning_rate": 1.5089299589346463e-08,
"loss": 2.4377,
"step": 1131
},
{
"epoch": 0.9895645522591925,
"grad_norm": 0.76953125,
"learning_rate": 1.2679406873750133e-08,
"loss": 2.4588,
"step": 1132
},
{
"epoch": 0.9904387258919303,
"grad_norm": 0.78515625,
"learning_rate": 1.0479002401264648e-08,
"loss": 2.5056,
"step": 1133
},
{
"epoch": 0.9913128995246681,
"grad_norm": 0.77734375,
"learning_rate": 8.488104619608361e-09,
"loss": 2.4797,
"step": 1134
},
{
"epoch": 0.9921870731574058,
"grad_norm": 0.765625,
"learning_rate": 6.70673022002688e-09,
"loss": 2.4936,
"step": 1135
},
{
"epoch": 0.9930612467901437,
"grad_norm": 0.7578125,
"learning_rate": 5.134894137179269e-09,
"loss": 2.4652,
"step": 1136
},
{
"epoch": 0.9939354204228815,
"grad_norm": 0.7578125,
"learning_rate": 3.772609548993722e-09,
"loss": 2.4188,
"step": 1137
},
{
"epoch": 0.9948095940556193,
"grad_norm": 0.7734375,
"learning_rate": 2.619887876564864e-09,
"loss": 2.4651,
"step": 1138
},
{
"epoch": 0.9956837676883571,
"grad_norm": 0.7578125,
"learning_rate": 1.6767387840649352e-09,
"loss": 2.4134,
"step": 1139
},
{
"epoch": 0.9965579413210949,
"grad_norm": 0.77734375,
"learning_rate": 9.431701786466462e-10,
"loss": 2.4731,
"step": 1140
},
{
"epoch": 0.9974321149538327,
"grad_norm": 0.78515625,
"learning_rate": 4.191882103904421e-10,
"loss": 2.4458,
"step": 1141
},
{
"epoch": 0.9983062885865706,
"grad_norm": 0.80078125,
"learning_rate": 1.0479727224621538e-10,
"loss": 2.4287,
"step": 1142
},
{
"epoch": 0.9991804622193083,
"grad_norm": 0.7734375,
"learning_rate": 0.0,
"loss": 2.4995,
"step": 1143
},
{
"epoch": 0.9991804622193083,
"step": 1143,
"total_flos": 1.0450600815746875e+19,
"train_loss": 2.588996330897013,
"train_runtime": 133512.32,
"train_samples_per_second": 1.097,
"train_steps_per_second": 0.009
}
],
"logging_steps": 1,
"max_steps": 1143,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 1.0450600815746875e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}