{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.48297512678097076,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002414875633904854,
"grad_norm": 0.48872238397598267,
"learning_rate": 9.638554216867472e-07,
"loss": 2.1188,
"step": 1
},
{
"epoch": 0.0004829751267809708,
"grad_norm": 0.4883142113685608,
"learning_rate": 1.9277108433734943e-06,
"loss": 1.943,
"step": 2
},
{
"epoch": 0.0007244626901714562,
"grad_norm": 2.160808563232422,
"learning_rate": 2.891566265060241e-06,
"loss": 2.3426,
"step": 3
},
{
"epoch": 0.0009659502535619416,
"grad_norm": 0.5656324625015259,
"learning_rate": 3.855421686746989e-06,
"loss": 2.0497,
"step": 4
},
{
"epoch": 0.001207437816952427,
"grad_norm": 0.5182572603225708,
"learning_rate": 4.819277108433735e-06,
"loss": 1.9081,
"step": 5
},
{
"epoch": 0.0014489253803429123,
"grad_norm": 0.615043044090271,
"learning_rate": 5.783132530120482e-06,
"loss": 2.0246,
"step": 6
},
{
"epoch": 0.0016904129437333977,
"grad_norm": 0.47701945900917053,
"learning_rate": 6.746987951807229e-06,
"loss": 1.9923,
"step": 7
},
{
"epoch": 0.001931900507123883,
"grad_norm": 0.4645046591758728,
"learning_rate": 7.710843373493977e-06,
"loss": 1.9992,
"step": 8
},
{
"epoch": 0.0021733880705143687,
"grad_norm": 0.6710774302482605,
"learning_rate": 8.674698795180724e-06,
"loss": 1.9561,
"step": 9
},
{
"epoch": 0.002414875633904854,
"grad_norm": 0.43727195262908936,
"learning_rate": 9.63855421686747e-06,
"loss": 1.9986,
"step": 10
},
{
"epoch": 0.0026563631972953395,
"grad_norm": 0.41306453943252563,
"learning_rate": 1.0602409638554219e-05,
"loss": 1.8657,
"step": 11
},
{
"epoch": 0.0028978507606858247,
"grad_norm": 0.496465802192688,
"learning_rate": 1.1566265060240964e-05,
"loss": 1.9444,
"step": 12
},
{
"epoch": 0.0031393383240763103,
"grad_norm": 0.40364280343055725,
"learning_rate": 1.2530120481927712e-05,
"loss": 2.0184,
"step": 13
},
{
"epoch": 0.0033808258874667954,
"grad_norm": 0.4289240539073944,
"learning_rate": 1.3493975903614458e-05,
"loss": 1.9886,
"step": 14
},
{
"epoch": 0.003622313450857281,
"grad_norm": 0.3964898884296417,
"learning_rate": 1.4457831325301207e-05,
"loss": 1.8049,
"step": 15
},
{
"epoch": 0.003863801014247766,
"grad_norm": 0.39897167682647705,
"learning_rate": 1.5421686746987955e-05,
"loss": 1.9805,
"step": 16
},
{
"epoch": 0.004105288577638252,
"grad_norm": 0.4459080696105957,
"learning_rate": 1.63855421686747e-05,
"loss": 1.9905,
"step": 17
},
{
"epoch": 0.004346776141028737,
"grad_norm": 0.7771973609924316,
"learning_rate": 1.7349397590361448e-05,
"loss": 2.2652,
"step": 18
},
{
"epoch": 0.004588263704419222,
"grad_norm": 0.4256933629512787,
"learning_rate": 1.8313253012048194e-05,
"loss": 2.0247,
"step": 19
},
{
"epoch": 0.004829751267809708,
"grad_norm": 0.41948211193084717,
"learning_rate": 1.927710843373494e-05,
"loss": 1.9011,
"step": 20
},
{
"epoch": 0.005071238831200193,
"grad_norm": 0.3880179524421692,
"learning_rate": 2.0240963855421687e-05,
"loss": 1.6799,
"step": 21
},
{
"epoch": 0.005312726394590679,
"grad_norm": 0.39275649189949036,
"learning_rate": 2.1204819277108437e-05,
"loss": 1.916,
"step": 22
},
{
"epoch": 0.005554213957981164,
"grad_norm": 0.35941553115844727,
"learning_rate": 2.2168674698795184e-05,
"loss": 1.7779,
"step": 23
},
{
"epoch": 0.005795701521371649,
"grad_norm": 0.4126398265361786,
"learning_rate": 2.3132530120481927e-05,
"loss": 2.004,
"step": 24
},
{
"epoch": 0.006037189084762135,
"grad_norm": 0.3780952990055084,
"learning_rate": 2.409638554216868e-05,
"loss": 1.8459,
"step": 25
},
{
"epoch": 0.0062786766481526205,
"grad_norm": 0.3541395366191864,
"learning_rate": 2.5060240963855423e-05,
"loss": 1.7157,
"step": 26
},
{
"epoch": 0.006520164211543105,
"grad_norm": 0.4550764858722687,
"learning_rate": 2.602409638554217e-05,
"loss": 1.8738,
"step": 27
},
{
"epoch": 0.006761651774933591,
"grad_norm": 0.4110875725746155,
"learning_rate": 2.6987951807228917e-05,
"loss": 1.7607,
"step": 28
},
{
"epoch": 0.0070031393383240765,
"grad_norm": 0.398453027009964,
"learning_rate": 2.7951807228915666e-05,
"loss": 1.9628,
"step": 29
},
{
"epoch": 0.007244626901714562,
"grad_norm": 0.3572748005390167,
"learning_rate": 2.8915662650602413e-05,
"loss": 1.775,
"step": 30
},
{
"epoch": 0.007486114465105047,
"grad_norm": 0.38363558053970337,
"learning_rate": 2.9879518072289156e-05,
"loss": 1.855,
"step": 31
},
{
"epoch": 0.007727602028495532,
"grad_norm": 0.392665296792984,
"learning_rate": 3.084337349397591e-05,
"loss": 2.0708,
"step": 32
},
{
"epoch": 0.007969089591886018,
"grad_norm": 0.42784029245376587,
"learning_rate": 3.180722891566265e-05,
"loss": 2.0002,
"step": 33
},
{
"epoch": 0.008210577155276504,
"grad_norm": 0.39450863003730774,
"learning_rate": 3.27710843373494e-05,
"loss": 1.7978,
"step": 34
},
{
"epoch": 0.00845206471866699,
"grad_norm": 0.37916016578674316,
"learning_rate": 3.373493975903615e-05,
"loss": 1.7597,
"step": 35
},
{
"epoch": 0.008693552282057475,
"grad_norm": 0.3838157653808594,
"learning_rate": 3.4698795180722896e-05,
"loss": 1.7366,
"step": 36
},
{
"epoch": 0.008935039845447959,
"grad_norm": 0.39187654852867126,
"learning_rate": 3.566265060240964e-05,
"loss": 1.7743,
"step": 37
},
{
"epoch": 0.009176527408838444,
"grad_norm": 0.4216479957103729,
"learning_rate": 3.662650602409639e-05,
"loss": 1.9526,
"step": 38
},
{
"epoch": 0.00941801497222893,
"grad_norm": 0.3791981637477875,
"learning_rate": 3.759036144578314e-05,
"loss": 1.8637,
"step": 39
},
{
"epoch": 0.009659502535619416,
"grad_norm": 0.4517281949520111,
"learning_rate": 3.855421686746988e-05,
"loss": 1.9789,
"step": 40
},
{
"epoch": 0.009900990099009901,
"grad_norm": 0.3904320001602173,
"learning_rate": 3.9518072289156625e-05,
"loss": 1.9162,
"step": 41
},
{
"epoch": 0.010142477662400387,
"grad_norm": 0.39694979786872864,
"learning_rate": 4.0481927710843375e-05,
"loss": 2.0246,
"step": 42
},
{
"epoch": 0.010383965225790872,
"grad_norm": 0.39392992854118347,
"learning_rate": 4.1445783132530125e-05,
"loss": 1.8925,
"step": 43
},
{
"epoch": 0.010625452789181358,
"grad_norm": 0.3753025233745575,
"learning_rate": 4.2409638554216875e-05,
"loss": 1.777,
"step": 44
},
{
"epoch": 0.010866940352571842,
"grad_norm": 0.35296690464019775,
"learning_rate": 4.337349397590362e-05,
"loss": 1.7254,
"step": 45
},
{
"epoch": 0.011108427915962327,
"grad_norm": 0.39575520157814026,
"learning_rate": 4.433734939759037e-05,
"loss": 1.819,
"step": 46
},
{
"epoch": 0.011349915479352813,
"grad_norm": 0.415618896484375,
"learning_rate": 4.530120481927712e-05,
"loss": 1.9398,
"step": 47
},
{
"epoch": 0.011591403042743299,
"grad_norm": 0.3653118908405304,
"learning_rate": 4.6265060240963854e-05,
"loss": 1.7664,
"step": 48
},
{
"epoch": 0.011832890606133784,
"grad_norm": 0.38401493430137634,
"learning_rate": 4.7228915662650604e-05,
"loss": 1.9299,
"step": 49
},
{
"epoch": 0.01207437816952427,
"grad_norm": 0.4112469255924225,
"learning_rate": 4.819277108433736e-05,
"loss": 1.9618,
"step": 50
},
{
"epoch": 0.012315865732914755,
"grad_norm": 0.39517056941986084,
"learning_rate": 4.91566265060241e-05,
"loss": 2.0678,
"step": 51
},
{
"epoch": 0.012557353296305241,
"grad_norm": 0.38852378726005554,
"learning_rate": 5.012048192771085e-05,
"loss": 1.9389,
"step": 52
},
{
"epoch": 0.012798840859695725,
"grad_norm": 0.392365425825119,
"learning_rate": 5.108433734939759e-05,
"loss": 1.872,
"step": 53
},
{
"epoch": 0.01304032842308621,
"grad_norm": 0.40039297938346863,
"learning_rate": 5.204819277108434e-05,
"loss": 1.9234,
"step": 54
},
{
"epoch": 0.013281815986476696,
"grad_norm": 0.37631353735923767,
"learning_rate": 5.301204819277109e-05,
"loss": 1.8483,
"step": 55
},
{
"epoch": 0.013523303549867182,
"grad_norm": 0.3847208321094513,
"learning_rate": 5.397590361445783e-05,
"loss": 1.7396,
"step": 56
},
{
"epoch": 0.013764791113257667,
"grad_norm": 0.43836677074432373,
"learning_rate": 5.493975903614458e-05,
"loss": 2.1202,
"step": 57
},
{
"epoch": 0.014006278676648153,
"grad_norm": 0.4151008427143097,
"learning_rate": 5.590361445783133e-05,
"loss": 1.9056,
"step": 58
},
{
"epoch": 0.014247766240038639,
"grad_norm": 0.4057491719722748,
"learning_rate": 5.6867469879518076e-05,
"loss": 1.8731,
"step": 59
},
{
"epoch": 0.014489253803429124,
"grad_norm": 0.39896196126937866,
"learning_rate": 5.7831325301204826e-05,
"loss": 1.7901,
"step": 60
},
{
"epoch": 0.014730741366819608,
"grad_norm": 0.5027028322219849,
"learning_rate": 5.8795180722891576e-05,
"loss": 2.176,
"step": 61
},
{
"epoch": 0.014972228930210094,
"grad_norm": 0.41533949971199036,
"learning_rate": 5.975903614457831e-05,
"loss": 1.8349,
"step": 62
},
{
"epoch": 0.01521371649360058,
"grad_norm": 0.41627174615859985,
"learning_rate": 6.072289156626506e-05,
"loss": 1.8164,
"step": 63
},
{
"epoch": 0.015455204056991065,
"grad_norm": 0.3680180311203003,
"learning_rate": 6.168674698795182e-05,
"loss": 1.7825,
"step": 64
},
{
"epoch": 0.01569669162038155,
"grad_norm": 0.3980069160461426,
"learning_rate": 6.265060240963856e-05,
"loss": 1.8251,
"step": 65
},
{
"epoch": 0.015938179183772036,
"grad_norm": 0.3967473804950714,
"learning_rate": 6.36144578313253e-05,
"loss": 1.8168,
"step": 66
},
{
"epoch": 0.01617966674716252,
"grad_norm": 0.3991287052631378,
"learning_rate": 6.457831325301206e-05,
"loss": 1.8828,
"step": 67
},
{
"epoch": 0.016421154310553007,
"grad_norm": 0.4125327467918396,
"learning_rate": 6.55421686746988e-05,
"loss": 1.848,
"step": 68
},
{
"epoch": 0.016662641873943493,
"grad_norm": 0.37583857774734497,
"learning_rate": 6.650602409638555e-05,
"loss": 1.7656,
"step": 69
},
{
"epoch": 0.01690412943733398,
"grad_norm": 0.43856287002563477,
"learning_rate": 6.74698795180723e-05,
"loss": 1.9077,
"step": 70
},
{
"epoch": 0.017145617000724464,
"grad_norm": 0.39317071437835693,
"learning_rate": 6.843373493975903e-05,
"loss": 1.8317,
"step": 71
},
{
"epoch": 0.01738710456411495,
"grad_norm": 0.3993190824985504,
"learning_rate": 6.939759036144579e-05,
"loss": 1.8451,
"step": 72
},
{
"epoch": 0.017628592127505432,
"grad_norm": 0.3683207333087921,
"learning_rate": 7.036144578313253e-05,
"loss": 1.7778,
"step": 73
},
{
"epoch": 0.017870079690895917,
"grad_norm": 0.38704434037208557,
"learning_rate": 7.132530120481928e-05,
"loss": 1.8159,
"step": 74
},
{
"epoch": 0.018111567254286403,
"grad_norm": 0.42196622490882874,
"learning_rate": 7.228915662650603e-05,
"loss": 2.1045,
"step": 75
},
{
"epoch": 0.01835305481767689,
"grad_norm": 0.3692149817943573,
"learning_rate": 7.325301204819278e-05,
"loss": 1.7807,
"step": 76
},
{
"epoch": 0.018594542381067374,
"grad_norm": 0.3880510926246643,
"learning_rate": 7.421686746987952e-05,
"loss": 1.7362,
"step": 77
},
{
"epoch": 0.01883602994445786,
"grad_norm": 0.379742830991745,
"learning_rate": 7.518072289156628e-05,
"loss": 1.8806,
"step": 78
},
{
"epoch": 0.019077517507848345,
"grad_norm": 0.3501541018486023,
"learning_rate": 7.614457831325302e-05,
"loss": 1.6607,
"step": 79
},
{
"epoch": 0.01931900507123883,
"grad_norm": 0.3936968743801117,
"learning_rate": 7.710843373493976e-05,
"loss": 1.9365,
"step": 80
},
{
"epoch": 0.019560492634629317,
"grad_norm": 0.3812267780303955,
"learning_rate": 7.807228915662652e-05,
"loss": 1.8093,
"step": 81
},
{
"epoch": 0.019801980198019802,
"grad_norm": 0.3729088604450226,
"learning_rate": 7.903614457831325e-05,
"loss": 1.7508,
"step": 82
},
{
"epoch": 0.020043467761410288,
"grad_norm": 0.36335960030555725,
"learning_rate": 8e-05,
"loss": 1.7563,
"step": 83
},
{
"epoch": 0.020284955324800773,
"grad_norm": 0.3932444155216217,
"learning_rate": 7.999998801313446e-05,
"loss": 1.9381,
"step": 84
},
{
"epoch": 0.02052644288819126,
"grad_norm": 0.37464866042137146,
"learning_rate": 7.9999952052545e-05,
"loss": 1.897,
"step": 85
},
{
"epoch": 0.020767930451581745,
"grad_norm": 0.5091702938079834,
"learning_rate": 7.99998921182532e-05,
"loss": 2.0178,
"step": 86
},
{
"epoch": 0.02100941801497223,
"grad_norm": 0.35622596740722656,
"learning_rate": 7.999980821029496e-05,
"loss": 1.7142,
"step": 87
},
{
"epoch": 0.021250905578362716,
"grad_norm": 0.35853254795074463,
"learning_rate": 7.999970032872057e-05,
"loss": 1.727,
"step": 88
},
{
"epoch": 0.021492393141753198,
"grad_norm": 0.37769579887390137,
"learning_rate": 7.99995684735947e-05,
"loss": 1.8811,
"step": 89
},
{
"epoch": 0.021733880705143684,
"grad_norm": 0.3953562378883362,
"learning_rate": 7.999941264499637e-05,
"loss": 1.8882,
"step": 90
},
{
"epoch": 0.02197536826853417,
"grad_norm": 0.3842523992061615,
"learning_rate": 7.999923284301897e-05,
"loss": 1.9009,
"step": 91
},
{
"epoch": 0.022216855831924655,
"grad_norm": 0.4005531072616577,
"learning_rate": 7.999902906777028e-05,
"loss": 2.0613,
"step": 92
},
{
"epoch": 0.02245834339531514,
"grad_norm": 0.37064820528030396,
"learning_rate": 7.999880131937242e-05,
"loss": 1.9517,
"step": 93
},
{
"epoch": 0.022699830958705626,
"grad_norm": 0.372097373008728,
"learning_rate": 7.999854959796187e-05,
"loss": 1.8402,
"step": 94
},
{
"epoch": 0.02294131852209611,
"grad_norm": 0.34422364830970764,
"learning_rate": 7.999827390368954e-05,
"loss": 1.754,
"step": 95
},
{
"epoch": 0.023182806085486597,
"grad_norm": 0.4320511817932129,
"learning_rate": 7.999797423672062e-05,
"loss": 1.9835,
"step": 96
},
{
"epoch": 0.023424293648877083,
"grad_norm": 0.34041526913642883,
"learning_rate": 7.999765059723475e-05,
"loss": 1.593,
"step": 97
},
{
"epoch": 0.02366578121226757,
"grad_norm": 0.3749473989009857,
"learning_rate": 7.999730298542589e-05,
"loss": 1.9249,
"step": 98
},
{
"epoch": 0.023907268775658054,
"grad_norm": 0.37020304799079895,
"learning_rate": 7.999693140150238e-05,
"loss": 1.9598,
"step": 99
},
{
"epoch": 0.02414875633904854,
"grad_norm": 0.3638790249824524,
"learning_rate": 7.99965358456869e-05,
"loss": 1.7858,
"step": 100
},
{
"epoch": 0.024390243902439025,
"grad_norm": 0.35202088952064514,
"learning_rate": 7.999611631821657e-05,
"loss": 1.8988,
"step": 101
},
{
"epoch": 0.02463173146582951,
"grad_norm": 0.3286641538143158,
"learning_rate": 7.999567281934278e-05,
"loss": 1.73,
"step": 102
},
{
"epoch": 0.024873219029219996,
"grad_norm": 0.3850080668926239,
"learning_rate": 7.99952053493314e-05,
"loss": 1.8341,
"step": 103
},
{
"epoch": 0.025114706592610482,
"grad_norm": 0.354960560798645,
"learning_rate": 7.999471390846253e-05,
"loss": 2.0089,
"step": 104
},
{
"epoch": 0.025356194156000968,
"grad_norm": 0.3476881682872772,
"learning_rate": 7.999419849703078e-05,
"loss": 1.833,
"step": 105
},
{
"epoch": 0.02559768171939145,
"grad_norm": 0.35317471623420715,
"learning_rate": 7.999365911534503e-05,
"loss": 1.8344,
"step": 106
},
{
"epoch": 0.025839169282781935,
"grad_norm": 0.3764777183532715,
"learning_rate": 7.999309576372855e-05,
"loss": 1.9944,
"step": 107
},
{
"epoch": 0.02608065684617242,
"grad_norm": 0.3360855281352997,
"learning_rate": 7.999250844251898e-05,
"loss": 1.7526,
"step": 108
},
{
"epoch": 0.026322144409562907,
"grad_norm": 0.37262898683547974,
"learning_rate": 7.999189715206832e-05,
"loss": 1.7409,
"step": 109
},
{
"epoch": 0.026563631972953392,
"grad_norm": 0.34567996859550476,
"learning_rate": 7.999126189274298e-05,
"loss": 1.76,
"step": 110
},
{
"epoch": 0.026805119536343878,
"grad_norm": 0.37824591994285583,
"learning_rate": 7.999060266492366e-05,
"loss": 1.9955,
"step": 111
},
{
"epoch": 0.027046607099734363,
"grad_norm": 0.3456074297428131,
"learning_rate": 7.998991946900549e-05,
"loss": 1.6786,
"step": 112
},
{
"epoch": 0.02728809466312485,
"grad_norm": 0.40303823351860046,
"learning_rate": 7.998921230539792e-05,
"loss": 2.009,
"step": 113
},
{
"epoch": 0.027529582226515335,
"grad_norm": 0.37486642599105835,
"learning_rate": 7.998848117452479e-05,
"loss": 2.0262,
"step": 114
},
{
"epoch": 0.02777106978990582,
"grad_norm": 0.35351452231407166,
"learning_rate": 7.998772607682431e-05,
"loss": 1.8546,
"step": 115
},
{
"epoch": 0.028012557353296306,
"grad_norm": 0.33875027298927307,
"learning_rate": 7.998694701274901e-05,
"loss": 1.766,
"step": 116
},
{
"epoch": 0.02825404491668679,
"grad_norm": 0.35830602049827576,
"learning_rate": 7.998614398276586e-05,
"loss": 1.6792,
"step": 117
},
{
"epoch": 0.028495532480077277,
"grad_norm": 0.33689743280410767,
"learning_rate": 7.998531698735611e-05,
"loss": 1.8919,
"step": 118
},
{
"epoch": 0.028737020043467763,
"grad_norm": 0.33229848742485046,
"learning_rate": 7.998446602701544e-05,
"loss": 1.8482,
"step": 119
},
{
"epoch": 0.028978507606858248,
"grad_norm": 0.3552752733230591,
"learning_rate": 7.998359110225386e-05,
"loss": 1.8519,
"step": 120
},
{
"epoch": 0.029219995170248734,
"grad_norm": 0.3789513111114502,
"learning_rate": 7.998269221359575e-05,
"loss": 1.7455,
"step": 121
},
{
"epoch": 0.029461482733639216,
"grad_norm": 0.32534146308898926,
"learning_rate": 7.998176936157986e-05,
"loss": 1.7738,
"step": 122
},
{
"epoch": 0.0297029702970297,
"grad_norm": 0.37436211109161377,
"learning_rate": 7.998082254675929e-05,
"loss": 1.8552,
"step": 123
},
{
"epoch": 0.029944457860420187,
"grad_norm": 0.3442078232765198,
"learning_rate": 7.99798517697015e-05,
"loss": 1.7527,
"step": 124
},
{
"epoch": 0.030185945423810673,
"grad_norm": 0.36838826537132263,
"learning_rate": 7.997885703098833e-05,
"loss": 1.8089,
"step": 125
},
{
"epoch": 0.03042743298720116,
"grad_norm": 0.3229195475578308,
"learning_rate": 7.997783833121595e-05,
"loss": 1.7343,
"step": 126
},
{
"epoch": 0.030668920550591644,
"grad_norm": 0.35546913743019104,
"learning_rate": 7.997679567099495e-05,
"loss": 1.8091,
"step": 127
},
{
"epoch": 0.03091040811398213,
"grad_norm": 0.3430229425430298,
"learning_rate": 7.99757290509502e-05,
"loss": 1.731,
"step": 128
},
{
"epoch": 0.031151895677372615,
"grad_norm": 0.34878894686698914,
"learning_rate": 7.997463847172099e-05,
"loss": 1.8177,
"step": 129
},
{
"epoch": 0.0313933832407631,
"grad_norm": 0.3356412649154663,
"learning_rate": 7.997352393396094e-05,
"loss": 1.8495,
"step": 130
},
{
"epoch": 0.031634870804153586,
"grad_norm": 0.3388964533805847,
"learning_rate": 7.997238543833807e-05,
"loss": 1.7708,
"step": 131
},
{
"epoch": 0.03187635836754407,
"grad_norm": 0.3642221689224243,
"learning_rate": 7.99712229855347e-05,
"loss": 1.8336,
"step": 132
},
{
"epoch": 0.03211784593093456,
"grad_norm": 0.3364923298358917,
"learning_rate": 7.997003657624755e-05,
"loss": 1.7808,
"step": 133
},
{
"epoch": 0.03235933349432504,
"grad_norm": 0.35074931383132935,
"learning_rate": 7.996882621118769e-05,
"loss": 1.8519,
"step": 134
},
{
"epoch": 0.03260082105771553,
"grad_norm": 0.3484658896923065,
"learning_rate": 7.996759189108053e-05,
"loss": 1.8158,
"step": 135
},
{
"epoch": 0.032842308621106014,
"grad_norm": 0.32097330689430237,
"learning_rate": 7.996633361666587e-05,
"loss": 1.7388,
"step": 136
},
{
"epoch": 0.0330837961844965,
"grad_norm": 0.3958728611469269,
"learning_rate": 7.996505138869783e-05,
"loss": 1.9125,
"step": 137
},
{
"epoch": 0.033325283747886986,
"grad_norm": 0.3487996757030487,
"learning_rate": 7.996374520794492e-05,
"loss": 1.9042,
"step": 138
},
{
"epoch": 0.03356677131127747,
"grad_norm": 0.38680174946784973,
"learning_rate": 7.996241507518998e-05,
"loss": 1.9944,
"step": 139
},
{
"epoch": 0.03380825887466796,
"grad_norm": 0.32666078209877014,
"learning_rate": 7.996106099123022e-05,
"loss": 1.6428,
"step": 140
},
{
"epoch": 0.03404974643805844,
"grad_norm": 0.3395536541938782,
"learning_rate": 7.995968295687719e-05,
"loss": 1.8936,
"step": 141
},
{
"epoch": 0.03429123400144893,
"grad_norm": 0.3326514661312103,
"learning_rate": 7.995828097295685e-05,
"loss": 1.7893,
"step": 142
},
{
"epoch": 0.034532721564839414,
"grad_norm": 0.35848790407180786,
"learning_rate": 7.995685504030941e-05,
"loss": 1.9426,
"step": 143
},
{
"epoch": 0.0347742091282299,
"grad_norm": 0.3663111925125122,
"learning_rate": 7.995540515978952e-05,
"loss": 1.8951,
"step": 144
},
{
"epoch": 0.035015696691620385,
"grad_norm": 0.33936575055122375,
"learning_rate": 7.995393133226616e-05,
"loss": 1.8215,
"step": 145
},
{
"epoch": 0.035257184255010864,
"grad_norm": 0.33017027378082275,
"learning_rate": 7.995243355862266e-05,
"loss": 1.8301,
"step": 146
},
{
"epoch": 0.03549867181840135,
"grad_norm": 0.33415642380714417,
"learning_rate": 7.99509118397567e-05,
"loss": 1.8482,
"step": 147
},
{
"epoch": 0.035740159381791835,
"grad_norm": 0.35916557908058167,
"learning_rate": 7.99493661765803e-05,
"loss": 1.8992,
"step": 148
},
{
"epoch": 0.03598164694518232,
"grad_norm": 0.3150824308395386,
"learning_rate": 7.994779657001984e-05,
"loss": 1.7173,
"step": 149
},
{
"epoch": 0.036223134508572806,
"grad_norm": 0.35707587003707886,
"learning_rate": 7.994620302101607e-05,
"loss": 2.0529,
"step": 150
},
{
"epoch": 0.03646462207196329,
"grad_norm": 0.30455395579338074,
"learning_rate": 7.994458553052406e-05,
"loss": 1.5871,
"step": 151
},
{
"epoch": 0.03670610963535378,
"grad_norm": 0.3313930034637451,
"learning_rate": 7.994294409951326e-05,
"loss": 1.7102,
"step": 152
},
{
"epoch": 0.03694759719874426,
"grad_norm": 0.3556051254272461,
"learning_rate": 7.994127872896744e-05,
"loss": 1.9564,
"step": 153
},
{
"epoch": 0.03718908476213475,
"grad_norm": 0.39041200280189514,
"learning_rate": 7.993958941988472e-05,
"loss": 2.0505,
"step": 154
},
{
"epoch": 0.037430572325525234,
"grad_norm": 0.35395804047584534,
"learning_rate": 7.993787617327758e-05,
"loss": 1.9035,
"step": 155
},
{
"epoch": 0.03767205988891572,
"grad_norm": 0.32132115960121155,
"learning_rate": 7.993613899017286e-05,
"loss": 1.8414,
"step": 156
},
{
"epoch": 0.037913547452306205,
"grad_norm": 0.32500675320625305,
"learning_rate": 7.99343778716117e-05,
"loss": 1.4969,
"step": 157
},
{
"epoch": 0.03815503501569669,
"grad_norm": 0.32838916778564453,
"learning_rate": 7.993259281864964e-05,
"loss": 1.7858,
"step": 158
},
{
"epoch": 0.038396522579087176,
"grad_norm": 0.3455624580383301,
"learning_rate": 7.993078383235653e-05,
"loss": 1.8199,
"step": 159
},
{
"epoch": 0.03863801014247766,
"grad_norm": 0.3421010375022888,
"learning_rate": 7.992895091381656e-05,
"loss": 1.8818,
"step": 160
},
{
"epoch": 0.03887949770586815,
"grad_norm": 0.360836386680603,
"learning_rate": 7.99270940641283e-05,
"loss": 1.9759,
"step": 161
},
{
"epoch": 0.03912098526925863,
"grad_norm": 0.32319512963294983,
"learning_rate": 7.992521328440463e-05,
"loss": 1.6659,
"step": 162
},
{
"epoch": 0.03936247283264912,
"grad_norm": 0.33566924929618835,
"learning_rate": 7.992330857577278e-05,
"loss": 1.7625,
"step": 163
},
{
"epoch": 0.039603960396039604,
"grad_norm": 0.3267430365085602,
"learning_rate": 7.992137993937434e-05,
"loss": 1.7359,
"step": 164
},
{
"epoch": 0.03984544795943009,
"grad_norm": 0.37398430705070496,
"learning_rate": 7.991942737636519e-05,
"loss": 2.0229,
"step": 165
},
{
"epoch": 0.040086935522820576,
"grad_norm": 0.3316766023635864,
"learning_rate": 7.991745088791563e-05,
"loss": 1.8624,
"step": 166
},
{
"epoch": 0.04032842308621106,
"grad_norm": 0.3504400849342346,
"learning_rate": 7.991545047521022e-05,
"loss": 2.0128,
"step": 167
},
{
"epoch": 0.04056991064960155,
"grad_norm": 0.3182665705680847,
"learning_rate": 7.991342613944791e-05,
"loss": 1.5942,
"step": 168
},
{
"epoch": 0.04081139821299203,
"grad_norm": 0.3529200851917267,
"learning_rate": 7.991137788184198e-05,
"loss": 1.9559,
"step": 169
},
{
"epoch": 0.04105288577638252,
"grad_norm": 0.35057875514030457,
"learning_rate": 7.990930570362002e-05,
"loss": 1.8836,
"step": 170
},
{
"epoch": 0.041294373339773004,
"grad_norm": 0.3297763764858246,
"learning_rate": 7.990720960602398e-05,
"loss": 1.8221,
"step": 171
},
{
"epoch": 0.04153586090316349,
"grad_norm": 0.3292389512062073,
"learning_rate": 7.990508959031015e-05,
"loss": 1.7315,
"step": 172
},
{
"epoch": 0.041777348466553975,
"grad_norm": 0.3380139172077179,
"learning_rate": 7.990294565774916e-05,
"loss": 1.7487,
"step": 173
},
{
"epoch": 0.04201883602994446,
"grad_norm": 0.3513992130756378,
"learning_rate": 7.990077780962593e-05,
"loss": 1.7758,
"step": 174
},
{
"epoch": 0.042260323593334946,
"grad_norm": 0.3371720612049103,
"learning_rate": 7.989858604723976e-05,
"loss": 1.7694,
"step": 175
},
{
"epoch": 0.04250181115672543,
"grad_norm": 0.3474743366241455,
"learning_rate": 7.989637037190427e-05,
"loss": 1.8237,
"step": 176
},
{
"epoch": 0.04274329872011592,
"grad_norm": 0.3570946455001831,
"learning_rate": 7.989413078494742e-05,
"loss": 1.852,
"step": 177
},
{
"epoch": 0.042984786283506396,
"grad_norm": 0.4105489253997803,
"learning_rate": 7.989186728771147e-05,
"loss": 2.0145,
"step": 178
},
{
"epoch": 0.04322627384689688,
"grad_norm": 0.31396129727363586,
"learning_rate": 7.988957988155305e-05,
"loss": 1.699,
"step": 179
},
{
"epoch": 0.04346776141028737,
"grad_norm": 0.33446812629699707,
"learning_rate": 7.98872685678431e-05,
"loss": 1.8951,
"step": 180
},
{
"epoch": 0.04370924897367785,
"grad_norm": 0.3372074067592621,
"learning_rate": 7.988493334796688e-05,
"loss": 1.751,
"step": 181
},
{
"epoch": 0.04395073653706834,
"grad_norm": 0.3188993036746979,
"learning_rate": 7.988257422332398e-05,
"loss": 1.6716,
"step": 182
},
{
"epoch": 0.044192224100458824,
"grad_norm": 0.32916897535324097,
"learning_rate": 7.988019119532834e-05,
"loss": 1.7562,
"step": 183
},
{
"epoch": 0.04443371166384931,
"grad_norm": 0.5125882029533386,
"learning_rate": 7.987778426540821e-05,
"loss": 2.423,
"step": 184
},
{
"epoch": 0.044675199227239795,
"grad_norm": 0.34698373079299927,
"learning_rate": 7.987535343500619e-05,
"loss": 1.8062,
"step": 185
},
{
"epoch": 0.04491668679063028,
"grad_norm": 0.349882036447525,
"learning_rate": 7.987289870557914e-05,
"loss": 1.9638,
"step": 186
},
{
"epoch": 0.045158174354020766,
"grad_norm": 0.34001484513282776,
"learning_rate": 7.98704200785983e-05,
"loss": 1.8865,
"step": 187
},
{
"epoch": 0.04539966191741125,
"grad_norm": 0.34518545866012573,
"learning_rate": 7.986791755554923e-05,
"loss": 1.7105,
"step": 188
},
{
"epoch": 0.04564114948080174,
"grad_norm": 0.3248199224472046,
"learning_rate": 7.986539113793179e-05,
"loss": 1.8116,
"step": 189
},
{
"epoch": 0.04588263704419222,
"grad_norm": 0.36076945066452026,
"learning_rate": 7.986284082726017e-05,
"loss": 1.8027,
"step": 190
},
{
"epoch": 0.04612412460758271,
"grad_norm": 0.34199753403663635,
"learning_rate": 7.98602666250629e-05,
"loss": 1.815,
"step": 191
},
{
"epoch": 0.046365612170973194,
"grad_norm": 0.35182511806488037,
"learning_rate": 7.985766853288278e-05,
"loss": 1.8876,
"step": 192
},
{
"epoch": 0.04660709973436368,
"grad_norm": 0.31644105911254883,
"learning_rate": 7.9855046552277e-05,
"loss": 1.807,
"step": 193
},
{
"epoch": 0.046848587297754166,
"grad_norm": 0.34520867466926575,
"learning_rate": 7.985240068481698e-05,
"loss": 1.8446,
"step": 194
},
{
"epoch": 0.04709007486114465,
"grad_norm": 0.33563631772994995,
"learning_rate": 7.984973093208852e-05,
"loss": 1.8509,
"step": 195
},
{
"epoch": 0.04733156242453514,
"grad_norm": 0.3410038352012634,
"learning_rate": 7.984703729569175e-05,
"loss": 2.0203,
"step": 196
},
{
"epoch": 0.04757304998792562,
"grad_norm": 0.3287442624568939,
"learning_rate": 7.984431977724105e-05,
"loss": 1.6625,
"step": 197
},
{
"epoch": 0.04781453755131611,
"grad_norm": 0.3447628915309906,
"learning_rate": 7.984157837836515e-05,
"loss": 2.0291,
"step": 198
},
{
"epoch": 0.048056025114706594,
"grad_norm": 0.31992051005363464,
"learning_rate": 7.983881310070709e-05,
"loss": 1.682,
"step": 199
},
{
"epoch": 0.04829751267809708,
"grad_norm": 0.3539101779460907,
"learning_rate": 7.983602394592422e-05,
"loss": 2.0146,
"step": 200
},
{
"epoch": 0.048539000241487565,
"grad_norm": 0.3836063742637634,
"learning_rate": 7.983321091568821e-05,
"loss": 1.6322,
"step": 201
},
{
"epoch": 0.04878048780487805,
"grad_norm": 0.3384498953819275,
"learning_rate": 7.983037401168503e-05,
"loss": 1.8082,
"step": 202
},
{
"epoch": 0.049021975368268536,
"grad_norm": 0.3204689621925354,
"learning_rate": 7.982751323561493e-05,
"loss": 1.7478,
"step": 203
},
{
"epoch": 0.04926346293165902,
"grad_norm": 0.363129585981369,
"learning_rate": 7.982462858919255e-05,
"loss": 1.8098,
"step": 204
},
{
"epoch": 0.04950495049504951,
"grad_norm": 0.3273480534553528,
"learning_rate": 7.982172007414675e-05,
"loss": 1.8649,
"step": 205
},
{
"epoch": 0.04974643805843999,
"grad_norm": 0.32456788420677185,
"learning_rate": 7.981878769222072e-05,
"loss": 1.773,
"step": 206
},
{
"epoch": 0.04998792562183048,
"grad_norm": 0.34132328629493713,
"learning_rate": 7.981583144517198e-05,
"loss": 1.7702,
"step": 207
},
{
"epoch": 0.050229413185220964,
"grad_norm": 0.3215339779853821,
"learning_rate": 7.981285133477233e-05,
"loss": 1.6318,
"step": 208
},
{
"epoch": 0.05047090074861145,
"grad_norm": 0.3282195031642914,
"learning_rate": 7.980984736280789e-05,
"loss": 1.7331,
"step": 209
},
{
"epoch": 0.050712388312001935,
"grad_norm": 0.3406447172164917,
"learning_rate": 7.980681953107905e-05,
"loss": 1.837,
"step": 210
},
{
"epoch": 0.050953875875392414,
"grad_norm": 0.3377143442630768,
"learning_rate": 7.980376784140055e-05,
"loss": 1.8457,
"step": 211
},
{
"epoch": 0.0511953634387829,
"grad_norm": 0.3229312002658844,
"learning_rate": 7.980069229560137e-05,
"loss": 1.7076,
"step": 212
},
{
"epoch": 0.051436851002173385,
"grad_norm": 0.3151211142539978,
"learning_rate": 7.979759289552484e-05,
"loss": 1.7162,
"step": 213
},
{
"epoch": 0.05167833856556387,
"grad_norm": 0.3200671672821045,
"learning_rate": 7.979446964302856e-05,
"loss": 1.6625,
"step": 214
},
{
"epoch": 0.051919826128954356,
"grad_norm": 0.33359915018081665,
"learning_rate": 7.979132253998442e-05,
"loss": 1.9556,
"step": 215
},
{
"epoch": 0.05216131369234484,
"grad_norm": 0.3339202404022217,
"learning_rate": 7.978815158827862e-05,
"loss": 1.7216,
"step": 216
},
{
"epoch": 0.05240280125573533,
"grad_norm": 0.3254282772541046,
"learning_rate": 7.978495678981165e-05,
"loss": 1.7696,
"step": 217
},
{
"epoch": 0.05264428881912581,
"grad_norm": 0.3372923731803894,
"learning_rate": 7.978173814649828e-05,
"loss": 1.837,
"step": 218
},
{
"epoch": 0.0528857763825163,
"grad_norm": 0.32411250472068787,
"learning_rate": 7.977849566026761e-05,
"loss": 1.8982,
"step": 219
},
{
"epoch": 0.053127263945906784,
"grad_norm": 0.31956303119659424,
"learning_rate": 7.977522933306298e-05,
"loss": 1.884,
"step": 220
},
{
"epoch": 0.05336875150929727,
"grad_norm": 0.3496444821357727,
"learning_rate": 7.977193916684204e-05,
"loss": 1.9066,
"step": 221
},
{
"epoch": 0.053610239072687756,
"grad_norm": 0.29580965638160706,
"learning_rate": 7.976862516357675e-05,
"loss": 1.6975,
"step": 222
},
{
"epoch": 0.05385172663607824,
"grad_norm": 0.30984580516815186,
"learning_rate": 7.976528732525332e-05,
"loss": 1.8103,
"step": 223
},
{
"epoch": 0.05409321419946873,
"grad_norm": 0.33822616934776306,
"learning_rate": 7.976192565387225e-05,
"loss": 1.8781,
"step": 224
},
{
"epoch": 0.05433470176285921,
"grad_norm": 0.32609352469444275,
"learning_rate": 7.975854015144834e-05,
"loss": 1.8569,
"step": 225
},
{
"epoch": 0.0545761893262497,
"grad_norm": 0.33209675550460815,
"learning_rate": 7.975513082001069e-05,
"loss": 1.9403,
"step": 226
},
{
"epoch": 0.054817676889640184,
"grad_norm": 0.3058185577392578,
"learning_rate": 7.975169766160265e-05,
"loss": 1.6912,
"step": 227
},
{
"epoch": 0.05505916445303067,
"grad_norm": 0.35320064425468445,
"learning_rate": 7.974824067828184e-05,
"loss": 1.9151,
"step": 228
},
{
"epoch": 0.055300652016421155,
"grad_norm": 0.336840957403183,
"learning_rate": 7.97447598721202e-05,
"loss": 1.8253,
"step": 229
},
{
"epoch": 0.05554213957981164,
"grad_norm": 0.320771723985672,
"learning_rate": 7.974125524520393e-05,
"loss": 1.7369,
"step": 230
},
{
"epoch": 0.055783627143202126,
"grad_norm": 0.35173293948173523,
"learning_rate": 7.973772679963348e-05,
"loss": 2.0621,
"step": 231
},
{
"epoch": 0.05602511470659261,
"grad_norm": 0.3257352113723755,
"learning_rate": 7.973417453752364e-05,
"loss": 1.8283,
"step": 232
},
{
"epoch": 0.0562666022699831,
"grad_norm": 0.32054367661476135,
"learning_rate": 7.97305984610034e-05,
"loss": 1.8359,
"step": 233
},
{
"epoch": 0.05650808983337358,
"grad_norm": 0.3325577974319458,
"learning_rate": 7.972699857221607e-05,
"loss": 1.9108,
"step": 234
},
{
"epoch": 0.05674957739676407,
"grad_norm": 0.3135945796966553,
"learning_rate": 7.972337487331923e-05,
"loss": 1.6775,
"step": 235
},
{
"epoch": 0.056991064960154554,
"grad_norm": 0.30711257457733154,
"learning_rate": 7.97197273664847e-05,
"loss": 1.7344,
"step": 236
},
{
"epoch": 0.05723255252354504,
"grad_norm": 0.3135779798030853,
"learning_rate": 7.971605605389858e-05,
"loss": 1.84,
"step": 237
},
{
"epoch": 0.057474040086935525,
"grad_norm": 0.29817330837249756,
"learning_rate": 7.971236093776129e-05,
"loss": 1.7427,
"step": 238
},
{
"epoch": 0.05771552765032601,
"grad_norm": 0.3177940845489502,
"learning_rate": 7.970864202028743e-05,
"loss": 1.7154,
"step": 239
},
{
"epoch": 0.057957015213716497,
"grad_norm": 0.3320569396018982,
"learning_rate": 7.970489930370593e-05,
"loss": 1.8771,
"step": 240
},
{
"epoch": 0.05819850277710698,
"grad_norm": 0.32810327410697937,
"learning_rate": 7.970113279025996e-05,
"loss": 1.8912,
"step": 241
},
{
"epoch": 0.05843999034049747,
"grad_norm": 0.3361932635307312,
"learning_rate": 7.969734248220695e-05,
"loss": 1.9356,
"step": 242
},
{
"epoch": 0.058681477903887946,
"grad_norm": 0.34913378953933716,
"learning_rate": 7.969352838181859e-05,
"loss": 1.8365,
"step": 243
},
{
"epoch": 0.05892296546727843,
"grad_norm": 0.3116905689239502,
"learning_rate": 7.968969049138086e-05,
"loss": 1.7415,
"step": 244
},
{
"epoch": 0.05916445303066892,
"grad_norm": 0.2941270172595978,
"learning_rate": 7.968582881319393e-05,
"loss": 1.6864,
"step": 245
},
{
"epoch": 0.0594059405940594,
"grad_norm": 0.32845309376716614,
"learning_rate": 7.968194334957231e-05,
"loss": 1.8652,
"step": 246
},
{
"epoch": 0.05964742815744989,
"grad_norm": 0.34726226329803467,
"learning_rate": 7.967803410284471e-05,
"loss": 1.7913,
"step": 247
},
{
"epoch": 0.059888915720840374,
"grad_norm": 0.3105839490890503,
"learning_rate": 7.967410107535414e-05,
"loss": 1.625,
"step": 248
},
{
"epoch": 0.06013040328423086,
"grad_norm": 0.3217976987361908,
"learning_rate": 7.967014426945778e-05,
"loss": 1.7158,
"step": 249
},
{
"epoch": 0.060371890847621346,
"grad_norm": 0.31204503774642944,
"learning_rate": 7.966616368752715e-05,
"loss": 1.7494,
"step": 250
},
{
"epoch": 0.06061337841101183,
"grad_norm": 0.3445545732975006,
"learning_rate": 7.966215933194797e-05,
"loss": 1.7762,
"step": 251
},
{
"epoch": 0.06085486597440232,
"grad_norm": 0.3073709011077881,
"learning_rate": 7.965813120512024e-05,
"loss": 1.5378,
"step": 252
},
{
"epoch": 0.0610963535377928,
"grad_norm": 0.3341065049171448,
"learning_rate": 7.965407930945818e-05,
"loss": 1.7331,
"step": 253
},
{
"epoch": 0.06133784110118329,
"grad_norm": 0.3325900137424469,
"learning_rate": 7.965000364739028e-05,
"loss": 1.8412,
"step": 254
},
{
"epoch": 0.061579328664573774,
"grad_norm": 0.3155021667480469,
"learning_rate": 7.964590422135923e-05,
"loss": 1.7861,
"step": 255
},
{
"epoch": 0.06182081622796426,
"grad_norm": 0.34470134973526,
"learning_rate": 7.964178103382201e-05,
"loss": 1.8445,
"step": 256
},
{
"epoch": 0.062062303791354745,
"grad_norm": 0.3327556848526001,
"learning_rate": 7.963763408724984e-05,
"loss": 1.7702,
"step": 257
},
{
"epoch": 0.06230379135474523,
"grad_norm": 0.3155532479286194,
"learning_rate": 7.963346338412816e-05,
"loss": 1.7478,
"step": 258
},
{
"epoch": 0.06254527891813572,
"grad_norm": 0.32543814182281494,
"learning_rate": 7.962926892695664e-05,
"loss": 1.8435,
"step": 259
},
{
"epoch": 0.0627867664815262,
"grad_norm": 0.3015563189983368,
"learning_rate": 7.962505071824919e-05,
"loss": 1.7412,
"step": 260
},
{
"epoch": 0.06302825404491669,
"grad_norm": 0.2858722507953644,
"learning_rate": 7.9620808760534e-05,
"loss": 1.5965,
"step": 261
},
{
"epoch": 0.06326974160830717,
"grad_norm": 0.309163361787796,
"learning_rate": 7.961654305635342e-05,
"loss": 1.7705,
"step": 262
},
{
"epoch": 0.06351122917169766,
"grad_norm": 0.31264615058898926,
"learning_rate": 7.96122536082641e-05,
"loss": 1.786,
"step": 263
},
{
"epoch": 0.06375271673508814,
"grad_norm": 0.31055596470832825,
"learning_rate": 7.960794041883688e-05,
"loss": 1.6784,
"step": 264
},
{
"epoch": 0.06399420429847863,
"grad_norm": 0.31669291853904724,
"learning_rate": 7.960360349065684e-05,
"loss": 1.7871,
"step": 265
},
{
"epoch": 0.06423569186186912,
"grad_norm": 0.3654109239578247,
"learning_rate": 7.95992428263233e-05,
"loss": 2.057,
"step": 266
},
{
"epoch": 0.0644771794252596,
"grad_norm": 0.2968808114528656,
"learning_rate": 7.959485842844977e-05,
"loss": 1.7963,
"step": 267
},
{
"epoch": 0.06471866698865009,
"grad_norm": 0.31135043501853943,
"learning_rate": 7.959045029966403e-05,
"loss": 1.7483,
"step": 268
},
{
"epoch": 0.06496015455204057,
"grad_norm": 0.30263540148735046,
"learning_rate": 7.958601844260807e-05,
"loss": 1.5378,
"step": 269
},
{
"epoch": 0.06520164211543106,
"grad_norm": 0.327248215675354,
"learning_rate": 7.958156285993807e-05,
"loss": 1.8316,
"step": 270
},
{
"epoch": 0.06544312967882154,
"grad_norm": 0.3525853455066681,
"learning_rate": 7.957708355432447e-05,
"loss": 2.1472,
"step": 271
},
{
"epoch": 0.06568461724221203,
"grad_norm": 0.3097147047519684,
"learning_rate": 7.957258052845189e-05,
"loss": 1.7649,
"step": 272
},
{
"epoch": 0.06592610480560251,
"grad_norm": 0.3462578058242798,
"learning_rate": 7.956805378501923e-05,
"loss": 1.926,
"step": 273
},
{
"epoch": 0.066167592368993,
"grad_norm": 0.32972514629364014,
"learning_rate": 7.956350332673954e-05,
"loss": 1.8855,
"step": 274
},
{
"epoch": 0.06640907993238349,
"grad_norm": 0.3470173478126526,
"learning_rate": 7.955892915634008e-05,
"loss": 1.8816,
"step": 275
},
{
"epoch": 0.06665056749577397,
"grad_norm": 0.3056792616844177,
"learning_rate": 7.955433127656239e-05,
"loss": 1.7791,
"step": 276
},
{
"epoch": 0.06689205505916446,
"grad_norm": 0.3143889605998993,
"learning_rate": 7.954970969016217e-05,
"loss": 1.7267,
"step": 277
},
{
"epoch": 0.06713354262255494,
"grad_norm": 0.3461814224720001,
"learning_rate": 7.954506439990931e-05,
"loss": 1.8244,
"step": 278
},
{
"epoch": 0.06737503018594543,
"grad_norm": 0.34658658504486084,
"learning_rate": 7.954039540858795e-05,
"loss": 1.888,
"step": 279
},
{
"epoch": 0.06761651774933591,
"grad_norm": 0.323635995388031,
"learning_rate": 7.953570271899644e-05,
"loss": 1.8313,
"step": 280
},
{
"epoch": 0.0678580053127264,
"grad_norm": 0.32019785046577454,
"learning_rate": 7.953098633394728e-05,
"loss": 1.7461,
"step": 281
},
{
"epoch": 0.06809949287611688,
"grad_norm": 0.3277647793292999,
"learning_rate": 7.95262462562672e-05,
"loss": 1.7611,
"step": 282
},
{
"epoch": 0.06834098043950737,
"grad_norm": 0.31137654185295105,
"learning_rate": 7.952148248879718e-05,
"loss": 1.7579,
"step": 283
},
{
"epoch": 0.06858246800289786,
"grad_norm": 0.3207230269908905,
"learning_rate": 7.951669503439232e-05,
"loss": 1.7806,
"step": 284
},
{
"epoch": 0.06882395556628834,
"grad_norm": 0.31498652696609497,
"learning_rate": 7.951188389592193e-05,
"loss": 1.8651,
"step": 285
},
{
"epoch": 0.06906544312967883,
"grad_norm": 0.32896509766578674,
"learning_rate": 7.950704907626956e-05,
"loss": 1.7896,
"step": 286
},
{
"epoch": 0.06930693069306931,
"grad_norm": 0.3297777473926544,
"learning_rate": 7.950219057833293e-05,
"loss": 1.87,
"step": 287
},
{
"epoch": 0.0695484182564598,
"grad_norm": 0.32095208764076233,
"learning_rate": 7.949730840502392e-05,
"loss": 1.8186,
"step": 288
},
{
"epoch": 0.06978990581985028,
"grad_norm": 0.3138609230518341,
"learning_rate": 7.949240255926867e-05,
"loss": 1.7104,
"step": 289
},
{
"epoch": 0.07003139338324077,
"grad_norm": 0.30844905972480774,
"learning_rate": 7.948747304400743e-05,
"loss": 1.7806,
"step": 290
},
{
"epoch": 0.07027288094663126,
"grad_norm": 0.3149530589580536,
"learning_rate": 7.948251986219468e-05,
"loss": 1.8081,
"step": 291
},
{
"epoch": 0.07051436851002173,
"grad_norm": 0.3314594328403473,
"learning_rate": 7.947754301679909e-05,
"loss": 1.8093,
"step": 292
},
{
"epoch": 0.07075585607341221,
"grad_norm": 0.32003554701805115,
"learning_rate": 7.947254251080348e-05,
"loss": 1.8002,
"step": 293
},
{
"epoch": 0.0709973436368027,
"grad_norm": 0.3048597574234009,
"learning_rate": 7.946751834720488e-05,
"loss": 1.8229,
"step": 294
},
{
"epoch": 0.07123883120019318,
"grad_norm": 0.3036291301250458,
"learning_rate": 7.946247052901449e-05,
"loss": 1.8471,
"step": 295
},
{
"epoch": 0.07148031876358367,
"grad_norm": 0.3238702118396759,
"learning_rate": 7.945739905925768e-05,
"loss": 1.7944,
"step": 296
},
{
"epoch": 0.07172180632697416,
"grad_norm": 0.31713131070137024,
"learning_rate": 7.945230394097399e-05,
"loss": 1.8629,
"step": 297
},
{
"epoch": 0.07196329389036464,
"grad_norm": 0.33282196521759033,
"learning_rate": 7.944718517721719e-05,
"loss": 1.8295,
"step": 298
},
{
"epoch": 0.07220478145375513,
"grad_norm": 0.3299509584903717,
"learning_rate": 7.944204277105512e-05,
"loss": 1.8887,
"step": 299
},
{
"epoch": 0.07244626901714561,
"grad_norm": 0.32252463698387146,
"learning_rate": 7.943687672556989e-05,
"loss": 1.9744,
"step": 300
},
{
"epoch": 0.0726877565805361,
"grad_norm": 0.31342577934265137,
"learning_rate": 7.943168704385771e-05,
"loss": 1.8915,
"step": 301
},
{
"epoch": 0.07292924414392658,
"grad_norm": 0.31736376881599426,
"learning_rate": 7.942647372902898e-05,
"loss": 1.6628,
"step": 302
},
{
"epoch": 0.07317073170731707,
"grad_norm": 0.3148774206638336,
"learning_rate": 7.942123678420829e-05,
"loss": 1.9219,
"step": 303
},
{
"epoch": 0.07341221927070755,
"grad_norm": 0.31064704060554504,
"learning_rate": 7.941597621253434e-05,
"loss": 1.6907,
"step": 304
},
{
"epoch": 0.07365370683409804,
"grad_norm": 0.34153732657432556,
"learning_rate": 7.941069201716003e-05,
"loss": 1.8361,
"step": 305
},
{
"epoch": 0.07389519439748853,
"grad_norm": 0.3452036380767822,
"learning_rate": 7.94053842012524e-05,
"loss": 1.9958,
"step": 306
},
{
"epoch": 0.07413668196087901,
"grad_norm": 0.3184818625450134,
"learning_rate": 7.940005276799267e-05,
"loss": 1.8116,
"step": 307
},
{
"epoch": 0.0743781695242695,
"grad_norm": 0.3384685516357422,
"learning_rate": 7.93946977205762e-05,
"loss": 1.8963,
"step": 308
},
{
"epoch": 0.07461965708765998,
"grad_norm": 0.31626102328300476,
"learning_rate": 7.938931906221246e-05,
"loss": 1.7312,
"step": 309
},
{
"epoch": 0.07486114465105047,
"grad_norm": 0.3364972472190857,
"learning_rate": 7.938391679612515e-05,
"loss": 1.9645,
"step": 310
},
{
"epoch": 0.07510263221444095,
"grad_norm": 0.31800857186317444,
"learning_rate": 7.93784909255521e-05,
"loss": 1.873,
"step": 311
},
{
"epoch": 0.07534411977783144,
"grad_norm": 0.2949671745300293,
"learning_rate": 7.937304145374522e-05,
"loss": 1.7794,
"step": 312
},
{
"epoch": 0.07558560734122192,
"grad_norm": 0.3183116912841797,
"learning_rate": 7.936756838397064e-05,
"loss": 1.9644,
"step": 313
},
{
"epoch": 0.07582709490461241,
"grad_norm": 0.32806089520454407,
"learning_rate": 7.93620717195086e-05,
"loss": 1.8161,
"step": 314
},
{
"epoch": 0.0760685824680029,
"grad_norm": 0.3097519874572754,
"learning_rate": 7.935655146365353e-05,
"loss": 1.8672,
"step": 315
},
{
"epoch": 0.07631007003139338,
"grad_norm": 0.3398526608943939,
"learning_rate": 7.935100761971388e-05,
"loss": 2.0628,
"step": 316
},
{
"epoch": 0.07655155759478387,
"grad_norm": 0.2980629503726959,
"learning_rate": 7.934544019101238e-05,
"loss": 1.7722,
"step": 317
},
{
"epoch": 0.07679304515817435,
"grad_norm": 0.33271175622940063,
"learning_rate": 7.93398491808858e-05,
"loss": 1.884,
"step": 318
},
{
"epoch": 0.07703453272156484,
"grad_norm": 0.3190302550792694,
"learning_rate": 7.933423459268509e-05,
"loss": 1.671,
"step": 319
},
{
"epoch": 0.07727602028495532,
"grad_norm": 0.309345006942749,
"learning_rate": 7.932859642977532e-05,
"loss": 1.7244,
"step": 320
},
{
"epoch": 0.07751750784834581,
"grad_norm": 0.3233974575996399,
"learning_rate": 7.932293469553566e-05,
"loss": 1.852,
"step": 321
},
{
"epoch": 0.0777589954117363,
"grad_norm": 0.301312118768692,
"learning_rate": 7.931724939335945e-05,
"loss": 1.7854,
"step": 322
},
{
"epoch": 0.07800048297512678,
"grad_norm": 0.33955588936805725,
"learning_rate": 7.931154052665413e-05,
"loss": 2.0226,
"step": 323
},
{
"epoch": 0.07824197053851727,
"grad_norm": 0.3273119330406189,
"learning_rate": 7.930580809884129e-05,
"loss": 1.8961,
"step": 324
},
{
"epoch": 0.07848345810190775,
"grad_norm": 0.30406197905540466,
"learning_rate": 7.930005211335659e-05,
"loss": 1.7842,
"step": 325
},
{
"epoch": 0.07872494566529824,
"grad_norm": 0.30615532398223877,
"learning_rate": 7.929427257364987e-05,
"loss": 1.6904,
"step": 326
},
{
"epoch": 0.07896643322868872,
"grad_norm": 0.30859431624412537,
"learning_rate": 7.928846948318504e-05,
"loss": 1.736,
"step": 327
},
{
"epoch": 0.07920792079207921,
"grad_norm": 0.3163640797138214,
"learning_rate": 7.928264284544015e-05,
"loss": 1.7944,
"step": 328
},
{
"epoch": 0.0794494083554697,
"grad_norm": 0.3048076629638672,
"learning_rate": 7.927679266390735e-05,
"loss": 1.8136,
"step": 329
},
{
"epoch": 0.07969089591886018,
"grad_norm": 0.30701613426208496,
"learning_rate": 7.927091894209293e-05,
"loss": 1.7733,
"step": 330
},
{
"epoch": 0.07993238348225067,
"grad_norm": 0.3330737054347992,
"learning_rate": 7.926502168351724e-05,
"loss": 1.7777,
"step": 331
},
{
"epoch": 0.08017387104564115,
"grad_norm": 0.3272298276424408,
"learning_rate": 7.925910089171478e-05,
"loss": 1.778,
"step": 332
},
{
"epoch": 0.08041535860903164,
"grad_norm": 0.3150383234024048,
"learning_rate": 7.925315657023412e-05,
"loss": 1.7796,
"step": 333
},
{
"epoch": 0.08065684617242212,
"grad_norm": 0.2980138659477234,
"learning_rate": 7.924718872263795e-05,
"loss": 1.6073,
"step": 334
},
{
"epoch": 0.08089833373581261,
"grad_norm": 0.4316518008708954,
"learning_rate": 7.924119735250307e-05,
"loss": 2.2031,
"step": 335
},
{
"epoch": 0.0811398212992031,
"grad_norm": 0.31057435274124146,
"learning_rate": 7.923518246342037e-05,
"loss": 1.6824,
"step": 336
},
{
"epoch": 0.08138130886259358,
"grad_norm": 0.3399696946144104,
"learning_rate": 7.922914405899482e-05,
"loss": 1.9264,
"step": 337
},
{
"epoch": 0.08162279642598406,
"grad_norm": 0.3247483968734741,
"learning_rate": 7.922308214284551e-05,
"loss": 1.8827,
"step": 338
},
{
"epoch": 0.08186428398937455,
"grad_norm": 0.31430065631866455,
"learning_rate": 7.921699671860561e-05,
"loss": 1.8006,
"step": 339
},
{
"epoch": 0.08210577155276504,
"grad_norm": 0.31440189480781555,
"learning_rate": 7.921088778992236e-05,
"loss": 1.8218,
"step": 340
},
{
"epoch": 0.08234725911615552,
"grad_norm": 0.29964831471443176,
"learning_rate": 7.920475536045711e-05,
"loss": 1.7306,
"step": 341
},
{
"epoch": 0.08258874667954601,
"grad_norm": 0.3098861575126648,
"learning_rate": 7.919859943388531e-05,
"loss": 1.8838,
"step": 342
},
{
"epoch": 0.08283023424293649,
"grad_norm": 0.3180960714817047,
"learning_rate": 7.919242001389645e-05,
"loss": 1.953,
"step": 343
},
{
"epoch": 0.08307172180632698,
"grad_norm": 0.31091493368148804,
"learning_rate": 7.918621710419414e-05,
"loss": 1.7183,
"step": 344
},
{
"epoch": 0.08331320936971746,
"grad_norm": 0.3297421932220459,
"learning_rate": 7.917999070849606e-05,
"loss": 1.966,
"step": 345
},
{
"epoch": 0.08355469693310795,
"grad_norm": 0.33071455359458923,
"learning_rate": 7.917374083053392e-05,
"loss": 1.8315,
"step": 346
},
{
"epoch": 0.08379618449649844,
"grad_norm": 0.31250739097595215,
"learning_rate": 7.916746747405358e-05,
"loss": 1.6587,
"step": 347
},
{
"epoch": 0.08403767205988892,
"grad_norm": 0.3179730176925659,
"learning_rate": 7.916117064281491e-05,
"loss": 1.9032,
"step": 348
},
{
"epoch": 0.0842791596232794,
"grad_norm": 0.3075062036514282,
"learning_rate": 7.915485034059191e-05,
"loss": 1.703,
"step": 349
},
{
"epoch": 0.08452064718666989,
"grad_norm": 0.3239034414291382,
"learning_rate": 7.914850657117255e-05,
"loss": 1.9085,
"step": 350
},
{
"epoch": 0.08476213475006038,
"grad_norm": 0.3100548982620239,
"learning_rate": 7.914213933835899e-05,
"loss": 1.91,
"step": 351
},
{
"epoch": 0.08500362231345086,
"grad_norm": 0.40014979243278503,
"learning_rate": 7.913574864596733e-05,
"loss": 1.7173,
"step": 352
},
{
"epoch": 0.08524510987684135,
"grad_norm": 0.3187917470932007,
"learning_rate": 7.912933449782784e-05,
"loss": 1.8536,
"step": 353
},
{
"epoch": 0.08548659744023183,
"grad_norm": 0.3200077712535858,
"learning_rate": 7.912289689778477e-05,
"loss": 1.8253,
"step": 354
},
{
"epoch": 0.08572808500362232,
"grad_norm": 0.2999676465988159,
"learning_rate": 7.911643584969644e-05,
"loss": 1.5448,
"step": 355
},
{
"epoch": 0.08596957256701279,
"grad_norm": 0.3196452558040619,
"learning_rate": 7.910995135743527e-05,
"loss": 1.7994,
"step": 356
},
{
"epoch": 0.08621106013040328,
"grad_norm": 0.32246023416519165,
"learning_rate": 7.910344342488767e-05,
"loss": 1.8654,
"step": 357
},
{
"epoch": 0.08645254769379376,
"grad_norm": 0.3140832185745239,
"learning_rate": 7.909691205595415e-05,
"loss": 1.7172,
"step": 358
},
{
"epoch": 0.08669403525718425,
"grad_norm": 0.3014783561229706,
"learning_rate": 7.909035725454922e-05,
"loss": 1.8307,
"step": 359
},
{
"epoch": 0.08693552282057473,
"grad_norm": 0.31697165966033936,
"learning_rate": 7.908377902460145e-05,
"loss": 1.8369,
"step": 360
},
{
"epoch": 0.08717701038396522,
"grad_norm": 0.34776023030281067,
"learning_rate": 7.907717737005347e-05,
"loss": 1.7673,
"step": 361
},
{
"epoch": 0.0874184979473557,
"grad_norm": 0.30561959743499756,
"learning_rate": 7.907055229486194e-05,
"loss": 1.7124,
"step": 362
},
{
"epoch": 0.08765998551074619,
"grad_norm": 0.31223785877227783,
"learning_rate": 7.906390380299757e-05,
"loss": 1.8257,
"step": 363
},
{
"epoch": 0.08790147307413668,
"grad_norm": 0.31563735008239746,
"learning_rate": 7.905723189844505e-05,
"loss": 1.6304,
"step": 364
},
{
"epoch": 0.08814296063752716,
"grad_norm": 0.3267379105091095,
"learning_rate": 7.905053658520317e-05,
"loss": 1.8192,
"step": 365
},
{
"epoch": 0.08838444820091765,
"grad_norm": 0.3055742084980011,
"learning_rate": 7.90438178672847e-05,
"loss": 1.7856,
"step": 366
},
{
"epoch": 0.08862593576430813,
"grad_norm": 0.3425109088420868,
"learning_rate": 7.90370757487165e-05,
"loss": 1.8594,
"step": 367
},
{
"epoch": 0.08886742332769862,
"grad_norm": 0.34041139483451843,
"learning_rate": 7.903031023353937e-05,
"loss": 1.8386,
"step": 368
},
{
"epoch": 0.0891089108910891,
"grad_norm": 0.3045822083950043,
"learning_rate": 7.902352132580818e-05,
"loss": 1.7817,
"step": 369
},
{
"epoch": 0.08935039845447959,
"grad_norm": 0.33832845091819763,
"learning_rate": 7.901670902959184e-05,
"loss": 1.891,
"step": 370
},
{
"epoch": 0.08959188601787008,
"grad_norm": 0.31787779927253723,
"learning_rate": 7.900987334897323e-05,
"loss": 1.8206,
"step": 371
},
{
"epoch": 0.08983337358126056,
"grad_norm": 0.3053485155105591,
"learning_rate": 7.900301428804929e-05,
"loss": 1.8119,
"step": 372
},
{
"epoch": 0.09007486114465105,
"grad_norm": 0.3189673125743866,
"learning_rate": 7.899613185093094e-05,
"loss": 1.7181,
"step": 373
},
{
"epoch": 0.09031634870804153,
"grad_norm": 0.330003947019577,
"learning_rate": 7.898922604174312e-05,
"loss": 1.7952,
"step": 374
},
{
"epoch": 0.09055783627143202,
"grad_norm": 0.32323578000068665,
"learning_rate": 7.89822968646248e-05,
"loss": 1.8331,
"step": 375
},
{
"epoch": 0.0907993238348225,
"grad_norm": 0.3234061896800995,
"learning_rate": 7.897534432372891e-05,
"loss": 1.8201,
"step": 376
},
{
"epoch": 0.09104081139821299,
"grad_norm": 0.3311329185962677,
"learning_rate": 7.896836842322241e-05,
"loss": 1.8964,
"step": 377
},
{
"epoch": 0.09128229896160348,
"grad_norm": 0.33288565278053284,
"learning_rate": 7.896136916728628e-05,
"loss": 1.7157,
"step": 378
},
{
"epoch": 0.09152378652499396,
"grad_norm": 0.2955407202243805,
"learning_rate": 7.895434656011546e-05,
"loss": 1.7627,
"step": 379
},
{
"epoch": 0.09176527408838445,
"grad_norm": 0.32634636759757996,
"learning_rate": 7.894730060591892e-05,
"loss": 1.9303,
"step": 380
},
{
"epoch": 0.09200676165177493,
"grad_norm": 0.3033986985683441,
"learning_rate": 7.894023130891958e-05,
"loss": 1.6711,
"step": 381
},
{
"epoch": 0.09224824921516542,
"grad_norm": 0.31847289204597473,
"learning_rate": 7.893313867335439e-05,
"loss": 1.7684,
"step": 382
},
{
"epoch": 0.0924897367785559,
"grad_norm": 0.3253558874130249,
"learning_rate": 7.892602270347427e-05,
"loss": 1.8255,
"step": 383
},
{
"epoch": 0.09273122434194639,
"grad_norm": 0.3166964054107666,
"learning_rate": 7.891888340354413e-05,
"loss": 1.7866,
"step": 384
},
{
"epoch": 0.09297271190533687,
"grad_norm": 0.3175016939640045,
"learning_rate": 7.891172077784288e-05,
"loss": 1.8906,
"step": 385
},
{
"epoch": 0.09321419946872736,
"grad_norm": 0.31322944164276123,
"learning_rate": 7.890453483066337e-05,
"loss": 1.8335,
"step": 386
},
{
"epoch": 0.09345568703211785,
"grad_norm": 0.324131041765213,
"learning_rate": 7.889732556631243e-05,
"loss": 1.8105,
"step": 387
},
{
"epoch": 0.09369717459550833,
"grad_norm": 0.3010997772216797,
"learning_rate": 7.889009298911093e-05,
"loss": 1.654,
"step": 388
},
{
"epoch": 0.09393866215889882,
"grad_norm": 0.3338373005390167,
"learning_rate": 7.888283710339364e-05,
"loss": 1.9387,
"step": 389
},
{
"epoch": 0.0941801497222893,
"grad_norm": 0.32794541120529175,
"learning_rate": 7.887555791350932e-05,
"loss": 1.7921,
"step": 390
},
{
"epoch": 0.09442163728567979,
"grad_norm": 0.30111920833587646,
"learning_rate": 7.886825542382073e-05,
"loss": 1.7964,
"step": 391
},
{
"epoch": 0.09466312484907027,
"grad_norm": 0.3243824243545532,
"learning_rate": 7.886092963870453e-05,
"loss": 1.8344,
"step": 392
},
{
"epoch": 0.09490461241246076,
"grad_norm": 0.32472896575927734,
"learning_rate": 7.885358056255141e-05,
"loss": 1.83,
"step": 393
},
{
"epoch": 0.09514609997585124,
"grad_norm": 0.3036370277404785,
"learning_rate": 7.884620819976599e-05,
"loss": 1.7287,
"step": 394
},
{
"epoch": 0.09538758753924173,
"grad_norm": 0.29689764976501465,
"learning_rate": 7.883881255476683e-05,
"loss": 1.6488,
"step": 395
},
{
"epoch": 0.09562907510263222,
"grad_norm": 0.3028903901576996,
"learning_rate": 7.883139363198647e-05,
"loss": 1.7084,
"step": 396
},
{
"epoch": 0.0958705626660227,
"grad_norm": 0.3292778730392456,
"learning_rate": 7.882395143587139e-05,
"loss": 1.6758,
"step": 397
},
{
"epoch": 0.09611205022941319,
"grad_norm": 0.31232085824012756,
"learning_rate": 7.8816485970882e-05,
"loss": 1.717,
"step": 398
},
{
"epoch": 0.09635353779280367,
"grad_norm": 0.29923149943351746,
"learning_rate": 7.880899724149272e-05,
"loss": 1.6746,
"step": 399
},
{
"epoch": 0.09659502535619416,
"grad_norm": 0.3306083083152771,
"learning_rate": 7.880148525219183e-05,
"loss": 1.8822,
"step": 400
},
{
"epoch": 0.09683651291958464,
"grad_norm": 0.3653043508529663,
"learning_rate": 7.879395000748162e-05,
"loss": 1.8299,
"step": 401
},
{
"epoch": 0.09707800048297513,
"grad_norm": 0.3056018054485321,
"learning_rate": 7.878639151187826e-05,
"loss": 1.7678,
"step": 402
},
{
"epoch": 0.09731948804636562,
"grad_norm": 0.3255383372306824,
"learning_rate": 7.87788097699119e-05,
"loss": 1.7895,
"step": 403
},
{
"epoch": 0.0975609756097561,
"grad_norm": 0.33858051896095276,
"learning_rate": 7.87712047861266e-05,
"loss": 1.7134,
"step": 404
},
{
"epoch": 0.09780246317314659,
"grad_norm": 0.31411212682724,
"learning_rate": 7.876357656508037e-05,
"loss": 1.8154,
"step": 405
},
{
"epoch": 0.09804395073653707,
"grad_norm": 0.3814811408519745,
"learning_rate": 7.87559251113451e-05,
"loss": 2.0315,
"step": 406
},
{
"epoch": 0.09828543829992756,
"grad_norm": 0.3188989758491516,
"learning_rate": 7.874825042950668e-05,
"loss": 1.8787,
"step": 407
},
{
"epoch": 0.09852692586331804,
"grad_norm": 0.31648024916648865,
"learning_rate": 7.874055252416486e-05,
"loss": 1.8118,
"step": 408
},
{
"epoch": 0.09876841342670853,
"grad_norm": 0.3221266269683838,
"learning_rate": 7.87328313999333e-05,
"loss": 1.7063,
"step": 409
},
{
"epoch": 0.09900990099009901,
"grad_norm": 0.3123248219490051,
"learning_rate": 7.872508706143966e-05,
"loss": 1.7513,
"step": 410
},
{
"epoch": 0.0992513885534895,
"grad_norm": 0.3216148614883423,
"learning_rate": 7.871731951332541e-05,
"loss": 1.7605,
"step": 411
},
{
"epoch": 0.09949287611687999,
"grad_norm": 0.32707303762435913,
"learning_rate": 7.8709528760246e-05,
"loss": 1.9001,
"step": 412
},
{
"epoch": 0.09973436368027047,
"grad_norm": 0.30996280908584595,
"learning_rate": 7.870171480687076e-05,
"loss": 1.7069,
"step": 413
},
{
"epoch": 0.09997585124366096,
"grad_norm": 0.32727497816085815,
"learning_rate": 7.869387765788293e-05,
"loss": 1.9334,
"step": 414
},
{
"epoch": 0.10021733880705144,
"grad_norm": 0.33509087562561035,
"learning_rate": 7.868601731797966e-05,
"loss": 1.9259,
"step": 415
},
{
"epoch": 0.10045882637044193,
"grad_norm": 0.31665563583374023,
"learning_rate": 7.867813379187197e-05,
"loss": 1.8213,
"step": 416
},
{
"epoch": 0.10070031393383241,
"grad_norm": 0.3228408098220825,
"learning_rate": 7.867022708428482e-05,
"loss": 1.984,
"step": 417
},
{
"epoch": 0.1009418014972229,
"grad_norm": 0.32034409046173096,
"learning_rate": 7.866229719995705e-05,
"loss": 1.6622,
"step": 418
},
{
"epoch": 0.10118328906061339,
"grad_norm": 0.3438382148742676,
"learning_rate": 7.865434414364136e-05,
"loss": 1.888,
"step": 419
},
{
"epoch": 0.10142477662400387,
"grad_norm": 0.3029784560203552,
"learning_rate": 7.864636792010437e-05,
"loss": 1.7853,
"step": 420
},
{
"epoch": 0.10166626418739434,
"grad_norm": 0.30743077397346497,
"learning_rate": 7.863836853412656e-05,
"loss": 1.8469,
"step": 421
},
{
"epoch": 0.10190775175078483,
"grad_norm": 0.2992570102214813,
"learning_rate": 7.863034599050235e-05,
"loss": 1.6541,
"step": 422
},
{
"epoch": 0.10214923931417531,
"grad_norm": 0.32089993357658386,
"learning_rate": 7.862230029403995e-05,
"loss": 1.8598,
"step": 423
},
{
"epoch": 0.1023907268775658,
"grad_norm": 0.316001832485199,
"learning_rate": 7.861423144956152e-05,
"loss": 1.7655,
"step": 424
},
{
"epoch": 0.10263221444095628,
"grad_norm": 0.2942180037498474,
"learning_rate": 7.860613946190306e-05,
"loss": 1.6929,
"step": 425
},
{
"epoch": 0.10287370200434677,
"grad_norm": 0.3150692582130432,
"learning_rate": 7.859802433591446e-05,
"loss": 1.8213,
"step": 426
},
{
"epoch": 0.10311518956773726,
"grad_norm": 0.32416391372680664,
"learning_rate": 7.858988607645945e-05,
"loss": 1.7896,
"step": 427
},
{
"epoch": 0.10335667713112774,
"grad_norm": 0.3014080822467804,
"learning_rate": 7.858172468841565e-05,
"loss": 1.785,
"step": 428
},
{
"epoch": 0.10359816469451823,
"grad_norm": 0.2929125428199768,
"learning_rate": 7.857354017667453e-05,
"loss": 1.7752,
"step": 429
},
{
"epoch": 0.10383965225790871,
"grad_norm": 0.2827056348323822,
"learning_rate": 7.856533254614143e-05,
"loss": 1.5381,
"step": 430
},
{
"epoch": 0.1040811398212992,
"grad_norm": 0.32544657588005066,
"learning_rate": 7.855710180173554e-05,
"loss": 1.8389,
"step": 431
},
{
"epoch": 0.10432262738468968,
"grad_norm": 0.32522135972976685,
"learning_rate": 7.854884794838987e-05,
"loss": 1.8001,
"step": 432
},
{
"epoch": 0.10456411494808017,
"grad_norm": 0.3139720559120178,
"learning_rate": 7.854057099105135e-05,
"loss": 1.9425,
"step": 433
},
{
"epoch": 0.10480560251147066,
"grad_norm": 0.31643447279930115,
"learning_rate": 7.85322709346807e-05,
"loss": 1.8469,
"step": 434
},
{
"epoch": 0.10504709007486114,
"grad_norm": 0.3155469000339508,
"learning_rate": 7.852394778425251e-05,
"loss": 1.7791,
"step": 435
},
{
"epoch": 0.10528857763825163,
"grad_norm": 0.3019671142101288,
"learning_rate": 7.851560154475519e-05,
"loss": 1.7212,
"step": 436
},
{
"epoch": 0.10553006520164211,
"grad_norm": 0.2972007989883423,
"learning_rate": 7.850723222119102e-05,
"loss": 1.6482,
"step": 437
},
{
"epoch": 0.1057715527650326,
"grad_norm": 0.31408366560935974,
"learning_rate": 7.84988398185761e-05,
"loss": 1.6854,
"step": 438
},
{
"epoch": 0.10601304032842308,
"grad_norm": 0.3040061295032501,
"learning_rate": 7.849042434194033e-05,
"loss": 1.541,
"step": 439
},
{
"epoch": 0.10625452789181357,
"grad_norm": 0.3008679151535034,
"learning_rate": 7.848198579632751e-05,
"loss": 1.6453,
"step": 440
},
{
"epoch": 0.10649601545520405,
"grad_norm": 0.3023047149181366,
"learning_rate": 7.847352418679519e-05,
"loss": 1.7668,
"step": 441
},
{
"epoch": 0.10673750301859454,
"grad_norm": 0.3142707347869873,
"learning_rate": 7.846503951841481e-05,
"loss": 1.8614,
"step": 442
},
{
"epoch": 0.10697899058198503,
"grad_norm": 0.3729107677936554,
"learning_rate": 7.845653179627158e-05,
"loss": 1.9223,
"step": 443
},
{
"epoch": 0.10722047814537551,
"grad_norm": 0.30760377645492554,
"learning_rate": 7.844800102546455e-05,
"loss": 1.8463,
"step": 444
},
{
"epoch": 0.107461965708766,
"grad_norm": 0.30554649233818054,
"learning_rate": 7.843944721110657e-05,
"loss": 1.8305,
"step": 445
},
{
"epoch": 0.10770345327215648,
"grad_norm": 0.33122071623802185,
"learning_rate": 7.843087035832433e-05,
"loss": 1.8518,
"step": 446
},
{
"epoch": 0.10794494083554697,
"grad_norm": 0.3029637336730957,
"learning_rate": 7.842227047225831e-05,
"loss": 1.7256,
"step": 447
},
{
"epoch": 0.10818642839893745,
"grad_norm": 0.3378790616989136,
"learning_rate": 7.841364755806276e-05,
"loss": 1.7585,
"step": 448
},
{
"epoch": 0.10842791596232794,
"grad_norm": 0.3104263246059418,
"learning_rate": 7.840500162090581e-05,
"loss": 1.8183,
"step": 449
},
{
"epoch": 0.10866940352571842,
"grad_norm": 0.31785666942596436,
"learning_rate": 7.839633266596932e-05,
"loss": 1.766,
"step": 450
},
{
"epoch": 0.10891089108910891,
"grad_norm": 0.306466668844223,
"learning_rate": 7.838764069844896e-05,
"loss": 1.8549,
"step": 451
},
{
"epoch": 0.1091523786524994,
"grad_norm": 0.32088086009025574,
"learning_rate": 7.837892572355422e-05,
"loss": 1.9489,
"step": 452
},
{
"epoch": 0.10939386621588988,
"grad_norm": 0.3011303246021271,
"learning_rate": 7.837018774650837e-05,
"loss": 1.6997,
"step": 453
},
{
"epoch": 0.10963535377928037,
"grad_norm": 0.3179236054420471,
"learning_rate": 7.836142677254844e-05,
"loss": 1.7926,
"step": 454
},
{
"epoch": 0.10987684134267085,
"grad_norm": 0.3137897551059723,
"learning_rate": 7.835264280692527e-05,
"loss": 1.8318,
"step": 455
},
{
"epoch": 0.11011832890606134,
"grad_norm": 0.3057895004749298,
"learning_rate": 7.834383585490347e-05,
"loss": 1.8321,
"step": 456
},
{
"epoch": 0.11035981646945182,
"grad_norm": 0.30022135376930237,
"learning_rate": 7.83350059217614e-05,
"loss": 1.7029,
"step": 457
},
{
"epoch": 0.11060130403284231,
"grad_norm": 0.32522615790367126,
"learning_rate": 7.832615301279128e-05,
"loss": 1.8882,
"step": 458
},
{
"epoch": 0.1108427915962328,
"grad_norm": 0.32540178298950195,
"learning_rate": 7.831727713329899e-05,
"loss": 1.8073,
"step": 459
},
{
"epoch": 0.11108427915962328,
"grad_norm": 0.3010394871234894,
"learning_rate": 7.830837828860425e-05,
"loss": 1.743,
"step": 460
},
{
"epoch": 0.11132576672301377,
"grad_norm": 0.30483198165893555,
"learning_rate": 7.829945648404051e-05,
"loss": 1.7134,
"step": 461
},
{
"epoch": 0.11156725428640425,
"grad_norm": 0.3163911700248718,
"learning_rate": 7.829051172495501e-05,
"loss": 1.7856,
"step": 462
},
{
"epoch": 0.11180874184979474,
"grad_norm": 0.31254616379737854,
"learning_rate": 7.828154401670873e-05,
"loss": 1.8231,
"step": 463
},
{
"epoch": 0.11205022941318522,
"grad_norm": 0.29678279161453247,
"learning_rate": 7.827255336467639e-05,
"loss": 1.7363,
"step": 464
},
{
"epoch": 0.11229171697657571,
"grad_norm": 0.3344740569591522,
"learning_rate": 7.826353977424648e-05,
"loss": 1.9809,
"step": 465
},
{
"epoch": 0.1125332045399662,
"grad_norm": 0.3076111078262329,
"learning_rate": 7.825450325082125e-05,
"loss": 1.7802,
"step": 466
},
{
"epoch": 0.11277469210335668,
"grad_norm": 0.3477588891983032,
"learning_rate": 7.824544379981667e-05,
"loss": 1.7485,
"step": 467
},
{
"epoch": 0.11301617966674717,
"grad_norm": 0.3137153685092926,
"learning_rate": 7.823636142666246e-05,
"loss": 1.843,
"step": 468
},
{
"epoch": 0.11325766723013765,
"grad_norm": 0.3323186933994293,
"learning_rate": 7.822725613680208e-05,
"loss": 1.9249,
"step": 469
},
{
"epoch": 0.11349915479352814,
"grad_norm": 0.3301668167114258,
"learning_rate": 7.821812793569272e-05,
"loss": 1.7981,
"step": 470
},
{
"epoch": 0.11374064235691862,
"grad_norm": 0.30615851283073425,
"learning_rate": 7.820897682880532e-05,
"loss": 1.7497,
"step": 471
},
{
"epoch": 0.11398212992030911,
"grad_norm": 0.32018932700157166,
"learning_rate": 7.819980282162453e-05,
"loss": 1.6479,
"step": 472
},
{
"epoch": 0.1142236174836996,
"grad_norm": 0.29449161887168884,
"learning_rate": 7.81906059196487e-05,
"loss": 1.7017,
"step": 473
},
{
"epoch": 0.11446510504709008,
"grad_norm": 0.38252466917037964,
"learning_rate": 7.818138612838998e-05,
"loss": 1.9334,
"step": 474
},
{
"epoch": 0.11470659261048056,
"grad_norm": 0.307882159948349,
"learning_rate": 7.817214345337416e-05,
"loss": 1.7899,
"step": 475
},
{
"epoch": 0.11494808017387105,
"grad_norm": 0.33490505814552307,
"learning_rate": 7.816287790014078e-05,
"loss": 1.8565,
"step": 476
},
{
"epoch": 0.11518956773726154,
"grad_norm": 0.3033045530319214,
"learning_rate": 7.81535894742431e-05,
"loss": 1.7633,
"step": 477
},
{
"epoch": 0.11543105530065202,
"grad_norm": 0.3428524136543274,
"learning_rate": 7.814427818124805e-05,
"loss": 1.9177,
"step": 478
},
{
"epoch": 0.11567254286404251,
"grad_norm": 0.2950695753097534,
"learning_rate": 7.813494402673631e-05,
"loss": 1.7384,
"step": 479
},
{
"epoch": 0.11591403042743299,
"grad_norm": 0.3069492280483246,
"learning_rate": 7.812558701630223e-05,
"loss": 1.8424,
"step": 480
},
{
"epoch": 0.11615551799082348,
"grad_norm": 0.32316040992736816,
"learning_rate": 7.811620715555388e-05,
"loss": 1.8142,
"step": 481
},
{
"epoch": 0.11639700555421396,
"grad_norm": 0.3073161840438843,
"learning_rate": 7.810680445011302e-05,
"loss": 1.707,
"step": 482
},
{
"epoch": 0.11663849311760445,
"grad_norm": 0.3104289770126343,
"learning_rate": 7.80973789056151e-05,
"loss": 1.6462,
"step": 483
},
{
"epoch": 0.11687998068099494,
"grad_norm": 0.3424341082572937,
"learning_rate": 7.808793052770923e-05,
"loss": 1.7575,
"step": 484
},
{
"epoch": 0.11712146824438542,
"grad_norm": 0.3131846785545349,
"learning_rate": 7.807845932205829e-05,
"loss": 1.8376,
"step": 485
},
{
"epoch": 0.11736295580777589,
"grad_norm": 0.3051791489124298,
"learning_rate": 7.806896529433872e-05,
"loss": 1.7343,
"step": 486
},
{
"epoch": 0.11760444337116638,
"grad_norm": 0.33665788173675537,
"learning_rate": 7.805944845024072e-05,
"loss": 2.0152,
"step": 487
},
{
"epoch": 0.11784593093455686,
"grad_norm": 0.31663084030151367,
"learning_rate": 7.804990879546817e-05,
"loss": 1.8096,
"step": 488
},
{
"epoch": 0.11808741849794735,
"grad_norm": 0.28980234265327454,
"learning_rate": 7.804034633573856e-05,
"loss": 1.5561,
"step": 489
},
{
"epoch": 0.11832890606133784,
"grad_norm": 0.3290766775608063,
"learning_rate": 7.803076107678314e-05,
"loss": 1.9711,
"step": 490
},
{
"epoch": 0.11857039362472832,
"grad_norm": 0.3099691867828369,
"learning_rate": 7.802115302434671e-05,
"loss": 1.7569,
"step": 491
},
{
"epoch": 0.1188118811881188,
"grad_norm": 0.2974450886249542,
"learning_rate": 7.801152218418784e-05,
"loss": 1.6721,
"step": 492
},
{
"epoch": 0.11905336875150929,
"grad_norm": 0.30560582876205444,
"learning_rate": 7.800186856207867e-05,
"loss": 1.797,
"step": 493
},
{
"epoch": 0.11929485631489978,
"grad_norm": 0.311374694108963,
"learning_rate": 7.799219216380506e-05,
"loss": 1.8303,
"step": 494
},
{
"epoch": 0.11953634387829026,
"grad_norm": 0.30325567722320557,
"learning_rate": 7.798249299516649e-05,
"loss": 1.6506,
"step": 495
},
{
"epoch": 0.11977783144168075,
"grad_norm": 0.30898353457450867,
"learning_rate": 7.797277106197609e-05,
"loss": 1.7336,
"step": 496
},
{
"epoch": 0.12001931900507123,
"grad_norm": 0.3123491704463959,
"learning_rate": 7.796302637006063e-05,
"loss": 1.8833,
"step": 497
},
{
"epoch": 0.12026080656846172,
"grad_norm": 0.3127240836620331,
"learning_rate": 7.795325892526054e-05,
"loss": 1.8374,
"step": 498
},
{
"epoch": 0.1205022941318522,
"grad_norm": 0.29663944244384766,
"learning_rate": 7.794346873342985e-05,
"loss": 1.7163,
"step": 499
},
{
"epoch": 0.12074378169524269,
"grad_norm": 0.30658090114593506,
"learning_rate": 7.793365580043625e-05,
"loss": 1.6681,
"step": 500
},
{
"epoch": 0.12098526925863318,
"grad_norm": 0.33467593789100647,
"learning_rate": 7.792382013216108e-05,
"loss": 1.9931,
"step": 501
},
{
"epoch": 0.12122675682202366,
"grad_norm": 0.33266401290893555,
"learning_rate": 7.791396173449926e-05,
"loss": 1.9241,
"step": 502
},
{
"epoch": 0.12146824438541415,
"grad_norm": 0.32820233702659607,
"learning_rate": 7.790408061335935e-05,
"loss": 1.804,
"step": 503
},
{
"epoch": 0.12170973194880463,
"grad_norm": 0.3111535608768463,
"learning_rate": 7.789417677466356e-05,
"loss": 1.8708,
"step": 504
},
{
"epoch": 0.12195121951219512,
"grad_norm": 0.3077269196510315,
"learning_rate": 7.788425022434766e-05,
"loss": 1.8725,
"step": 505
},
{
"epoch": 0.1221927070755856,
"grad_norm": 0.31311750411987305,
"learning_rate": 7.787430096836107e-05,
"loss": 1.7439,
"step": 506
},
{
"epoch": 0.12243419463897609,
"grad_norm": 0.3787113428115845,
"learning_rate": 7.786432901266681e-05,
"loss": 1.8704,
"step": 507
},
{
"epoch": 0.12267568220236658,
"grad_norm": 0.30987265706062317,
"learning_rate": 7.785433436324153e-05,
"loss": 1.7655,
"step": 508
},
{
"epoch": 0.12291716976575706,
"grad_norm": 0.327898234128952,
"learning_rate": 7.78443170260754e-05,
"loss": 1.7135,
"step": 509
},
{
"epoch": 0.12315865732914755,
"grad_norm": 0.3035421073436737,
"learning_rate": 7.78342770071723e-05,
"loss": 1.7768,
"step": 510
},
{
"epoch": 0.12340014489253803,
"grad_norm": 0.33335283398628235,
"learning_rate": 7.78242143125496e-05,
"loss": 1.8051,
"step": 511
},
{
"epoch": 0.12364163245592852,
"grad_norm": 0.2996819317340851,
"learning_rate": 7.781412894823837e-05,
"loss": 1.6147,
"step": 512
},
{
"epoch": 0.123883120019319,
"grad_norm": 0.33031609654426575,
"learning_rate": 7.780402092028314e-05,
"loss": 2.0319,
"step": 513
},
{
"epoch": 0.12412460758270949,
"grad_norm": 0.33066943287849426,
"learning_rate": 7.779389023474212e-05,
"loss": 1.7073,
"step": 514
},
{
"epoch": 0.12436609514609998,
"grad_norm": 0.31213587522506714,
"learning_rate": 7.778373689768707e-05,
"loss": 1.8183,
"step": 515
},
{
"epoch": 0.12460758270949046,
"grad_norm": 0.3083159625530243,
"learning_rate": 7.777356091520333e-05,
"loss": 1.6001,
"step": 516
},
{
"epoch": 0.12484907027288095,
"grad_norm": 0.3064626455307007,
"learning_rate": 7.776336229338978e-05,
"loss": 1.691,
"step": 517
},
{
"epoch": 0.12509055783627143,
"grad_norm": 0.3259071707725525,
"learning_rate": 7.775314103835892e-05,
"loss": 1.7561,
"step": 518
},
{
"epoch": 0.12533204539966192,
"grad_norm": 0.3399283289909363,
"learning_rate": 7.774289715623677e-05,
"loss": 1.7772,
"step": 519
},
{
"epoch": 0.1255735329630524,
"grad_norm": 0.29182565212249756,
"learning_rate": 7.773263065316296e-05,
"loss": 1.8109,
"step": 520
},
{
"epoch": 0.1258150205264429,
"grad_norm": 0.30191174149513245,
"learning_rate": 7.772234153529061e-05,
"loss": 1.8194,
"step": 521
},
{
"epoch": 0.12605650808983337,
"grad_norm": 0.3192642629146576,
"learning_rate": 7.771202980878648e-05,
"loss": 1.8612,
"step": 522
},
{
"epoch": 0.12629799565322386,
"grad_norm": 0.35645371675491333,
"learning_rate": 7.770169547983081e-05,
"loss": 1.8897,
"step": 523
},
{
"epoch": 0.12653948321661435,
"grad_norm": 0.2943851053714752,
"learning_rate": 7.769133855461739e-05,
"loss": 1.7457,
"step": 524
},
{
"epoch": 0.12678097078000483,
"grad_norm": 0.30867692828178406,
"learning_rate": 7.768095903935362e-05,
"loss": 1.7291,
"step": 525
},
{
"epoch": 0.12702245834339532,
"grad_norm": 0.315302312374115,
"learning_rate": 7.767055694026037e-05,
"loss": 1.7178,
"step": 526
},
{
"epoch": 0.1272639459067858,
"grad_norm": 0.31131064891815186,
"learning_rate": 7.766013226357204e-05,
"loss": 1.7799,
"step": 527
},
{
"epoch": 0.1275054334701763,
"grad_norm": 0.31616145372390747,
"learning_rate": 7.764968501553663e-05,
"loss": 1.7628,
"step": 528
},
{
"epoch": 0.12774692103356677,
"grad_norm": 0.33065786957740784,
"learning_rate": 7.763921520241561e-05,
"loss": 1.8967,
"step": 529
},
{
"epoch": 0.12798840859695726,
"grad_norm": 0.31441378593444824,
"learning_rate": 7.762872283048401e-05,
"loss": 1.7663,
"step": 530
},
{
"epoch": 0.12822989616034774,
"grad_norm": 0.3112833797931671,
"learning_rate": 7.761820790603032e-05,
"loss": 1.7743,
"step": 531
},
{
"epoch": 0.12847138372373823,
"grad_norm": 0.32388782501220703,
"learning_rate": 7.760767043535665e-05,
"loss": 1.6944,
"step": 532
},
{
"epoch": 0.12871287128712872,
"grad_norm": 0.32488951086997986,
"learning_rate": 7.759711042477852e-05,
"loss": 1.7904,
"step": 533
},
{
"epoch": 0.1289543588505192,
"grad_norm": 0.3164033889770508,
"learning_rate": 7.7586527880625e-05,
"loss": 1.6745,
"step": 534
},
{
"epoch": 0.1291958464139097,
"grad_norm": 0.2963300347328186,
"learning_rate": 7.757592280923868e-05,
"loss": 1.6547,
"step": 535
},
{
"epoch": 0.12943733397730017,
"grad_norm": 0.4110172390937805,
"learning_rate": 7.756529521697564e-05,
"loss": 1.7822,
"step": 536
},
{
"epoch": 0.12967882154069066,
"grad_norm": 0.31125178933143616,
"learning_rate": 7.755464511020546e-05,
"loss": 1.5875,
"step": 537
},
{
"epoch": 0.12992030910408114,
"grad_norm": 0.3192461431026459,
"learning_rate": 7.75439724953112e-05,
"loss": 1.7424,
"step": 538
},
{
"epoch": 0.13016179666747163,
"grad_norm": 0.3069697320461273,
"learning_rate": 7.75332773786894e-05,
"loss": 1.6848,
"step": 539
},
{
"epoch": 0.13040328423086212,
"grad_norm": 0.2977539002895355,
"learning_rate": 7.752255976675016e-05,
"loss": 1.6205,
"step": 540
},
{
"epoch": 0.1306447717942526,
"grad_norm": 0.3145061135292053,
"learning_rate": 7.751181966591695e-05,
"loss": 1.7091,
"step": 541
},
{
"epoch": 0.1308862593576431,
"grad_norm": 0.3141860067844391,
"learning_rate": 7.750105708262682e-05,
"loss": 1.8047,
"step": 542
},
{
"epoch": 0.13112774692103357,
"grad_norm": 0.31395336985588074,
"learning_rate": 7.749027202333023e-05,
"loss": 1.7166,
"step": 543
},
{
"epoch": 0.13136923448442406,
"grad_norm": 0.3210203945636749,
"learning_rate": 7.747946449449115e-05,
"loss": 1.8435,
"step": 544
},
{
"epoch": 0.13161072204781454,
"grad_norm": 0.32427695393562317,
"learning_rate": 7.746863450258698e-05,
"loss": 1.8571,
"step": 545
},
{
"epoch": 0.13185220961120503,
"grad_norm": 0.3196699321269989,
"learning_rate": 7.74577820541086e-05,
"loss": 1.5772,
"step": 546
},
{
"epoch": 0.13209369717459551,
"grad_norm": 0.31027519702911377,
"learning_rate": 7.744690715556039e-05,
"loss": 1.6998,
"step": 547
},
{
"epoch": 0.132335184737986,
"grad_norm": 0.3263092637062073,
"learning_rate": 7.74360098134601e-05,
"loss": 1.7748,
"step": 548
},
{
"epoch": 0.13257667230137649,
"grad_norm": 0.33741897344589233,
"learning_rate": 7.7425090034339e-05,
"loss": 1.919,
"step": 549
},
{
"epoch": 0.13281815986476697,
"grad_norm": 0.31624868512153625,
"learning_rate": 7.741414782474179e-05,
"loss": 1.6449,
"step": 550
},
{
"epoch": 0.13305964742815746,
"grad_norm": 0.3361668884754181,
"learning_rate": 7.740318319122661e-05,
"loss": 2.002,
"step": 551
},
{
"epoch": 0.13330113499154794,
"grad_norm": 0.3008631765842438,
"learning_rate": 7.739219614036504e-05,
"loss": 1.7912,
"step": 552
},
{
"epoch": 0.13354262255493843,
"grad_norm": 0.3018030524253845,
"learning_rate": 7.738118667874208e-05,
"loss": 1.7734,
"step": 553
},
{
"epoch": 0.1337841101183289,
"grad_norm": 0.31376197934150696,
"learning_rate": 7.737015481295618e-05,
"loss": 1.8312,
"step": 554
},
{
"epoch": 0.1340255976817194,
"grad_norm": 0.30704107880592346,
"learning_rate": 7.735910054961924e-05,
"loss": 1.7782,
"step": 555
},
{
"epoch": 0.13426708524510989,
"grad_norm": 0.309739351272583,
"learning_rate": 7.734802389535652e-05,
"loss": 1.7463,
"step": 556
},
{
"epoch": 0.13450857280850037,
"grad_norm": 0.30903351306915283,
"learning_rate": 7.733692485680677e-05,
"loss": 1.7354,
"step": 557
},
{
"epoch": 0.13475006037189086,
"grad_norm": 0.3019959330558777,
"learning_rate": 7.73258034406221e-05,
"loss": 1.6105,
"step": 558
},
{
"epoch": 0.13499154793528134,
"grad_norm": 0.30389368534088135,
"learning_rate": 7.731465965346809e-05,
"loss": 1.681,
"step": 559
},
{
"epoch": 0.13523303549867183,
"grad_norm": 0.33595752716064453,
"learning_rate": 7.730349350202366e-05,
"loss": 1.9905,
"step": 560
},
{
"epoch": 0.1354745230620623,
"grad_norm": 0.31182244420051575,
"learning_rate": 7.729230499298118e-05,
"loss": 1.7488,
"step": 561
},
{
"epoch": 0.1357160106254528,
"grad_norm": 0.30236539244651794,
"learning_rate": 7.72810941330464e-05,
"loss": 1.8256,
"step": 562
},
{
"epoch": 0.13595749818884328,
"grad_norm": 0.3096561133861542,
"learning_rate": 7.72698609289385e-05,
"loss": 1.6412,
"step": 563
},
{
"epoch": 0.13619898575223377,
"grad_norm": 0.317247599363327,
"learning_rate": 7.725860538739e-05,
"loss": 1.8292,
"step": 564
},
{
"epoch": 0.13644047331562426,
"grad_norm": 0.3323989808559418,
"learning_rate": 7.724732751514684e-05,
"loss": 2.0581,
"step": 565
},
{
"epoch": 0.13668196087901474,
"grad_norm": 0.3129768669605255,
"learning_rate": 7.723602731896833e-05,
"loss": 1.8479,
"step": 566
},
{
"epoch": 0.13692344844240523,
"grad_norm": 0.3054257035255432,
"learning_rate": 7.722470480562717e-05,
"loss": 1.7895,
"step": 567
},
{
"epoch": 0.1371649360057957,
"grad_norm": 0.3174823820590973,
"learning_rate": 7.721335998190944e-05,
"loss": 1.7581,
"step": 568
},
{
"epoch": 0.1374064235691862,
"grad_norm": 0.3012676239013672,
"learning_rate": 7.720199285461459e-05,
"loss": 1.751,
"step": 569
},
{
"epoch": 0.13764791113257668,
"grad_norm": 0.30346378684043884,
"learning_rate": 7.719060343055541e-05,
"loss": 1.6166,
"step": 570
},
{
"epoch": 0.13788939869596717,
"grad_norm": 0.3010426163673401,
"learning_rate": 7.717919171655809e-05,
"loss": 1.661,
"step": 571
},
{
"epoch": 0.13813088625935765,
"grad_norm": 0.32389774918556213,
"learning_rate": 7.716775771946214e-05,
"loss": 1.8483,
"step": 572
},
{
"epoch": 0.13837237382274814,
"grad_norm": 0.33098238706588745,
"learning_rate": 7.71563014461205e-05,
"loss": 1.8177,
"step": 573
},
{
"epoch": 0.13861386138613863,
"grad_norm": 0.3251645565032959,
"learning_rate": 7.714482290339936e-05,
"loss": 1.922,
"step": 574
},
{
"epoch": 0.1388553489495291,
"grad_norm": 0.3154045045375824,
"learning_rate": 7.713332209817832e-05,
"loss": 1.6444,
"step": 575
},
{
"epoch": 0.1390968365129196,
"grad_norm": 0.32885581254959106,
"learning_rate": 7.712179903735033e-05,
"loss": 1.699,
"step": 576
},
{
"epoch": 0.13933832407631008,
"grad_norm": 0.3207506835460663,
"learning_rate": 7.711025372782164e-05,
"loss": 1.9586,
"step": 577
},
{
"epoch": 0.13957981163970057,
"grad_norm": 0.30934199690818787,
"learning_rate": 7.709868617651186e-05,
"loss": 1.6781,
"step": 578
},
{
"epoch": 0.13982129920309105,
"grad_norm": 0.35921943187713623,
"learning_rate": 7.708709639035394e-05,
"loss": 2.1063,
"step": 579
},
{
"epoch": 0.14006278676648154,
"grad_norm": 0.2932882010936737,
"learning_rate": 7.707548437629411e-05,
"loss": 1.7951,
"step": 580
},
{
"epoch": 0.14030427432987203,
"grad_norm": 0.29395684599876404,
"learning_rate": 7.706385014129198e-05,
"loss": 1.5773,
"step": 581
},
{
"epoch": 0.1405457618932625,
"grad_norm": 0.31152844429016113,
"learning_rate": 7.705219369232041e-05,
"loss": 1.8562,
"step": 582
},
{
"epoch": 0.14078724945665297,
"grad_norm": 0.312266081571579,
"learning_rate": 7.704051503636566e-05,
"loss": 1.6907,
"step": 583
},
{
"epoch": 0.14102873702004345,
"grad_norm": 0.3400667905807495,
"learning_rate": 7.702881418042723e-05,
"loss": 1.7507,
"step": 584
},
{
"epoch": 0.14127022458343394,
"grad_norm": 0.3202289640903473,
"learning_rate": 7.701709113151795e-05,
"loss": 1.7275,
"step": 585
},
{
"epoch": 0.14151171214682443,
"grad_norm": 0.32572951912879944,
"learning_rate": 7.700534589666397e-05,
"loss": 1.8505,
"step": 586
},
{
"epoch": 0.1417531997102149,
"grad_norm": 0.3211197555065155,
"learning_rate": 7.699357848290469e-05,
"loss": 1.7782,
"step": 587
},
{
"epoch": 0.1419946872736054,
"grad_norm": 0.318483829498291,
"learning_rate": 7.698178889729286e-05,
"loss": 1.869,
"step": 588
},
{
"epoch": 0.14223617483699588,
"grad_norm": 0.29991650581359863,
"learning_rate": 7.696997714689445e-05,
"loss": 1.7344,
"step": 589
},
{
"epoch": 0.14247766240038637,
"grad_norm": 0.32764291763305664,
"learning_rate": 7.695814323878878e-05,
"loss": 1.9262,
"step": 590
},
{
"epoch": 0.14271914996377685,
"grad_norm": 0.3038786053657532,
"learning_rate": 7.694628718006843e-05,
"loss": 1.6972,
"step": 591
},
{
"epoch": 0.14296063752716734,
"grad_norm": 0.3058617115020752,
"learning_rate": 7.693440897783923e-05,
"loss": 1.7624,
"step": 592
},
{
"epoch": 0.14320212509055782,
"grad_norm": 0.29824337363243103,
"learning_rate": 7.692250863922031e-05,
"loss": 1.6855,
"step": 593
},
{
"epoch": 0.1434436126539483,
"grad_norm": 0.3037334084510803,
"learning_rate": 7.691058617134406e-05,
"loss": 1.7016,
"step": 594
},
{
"epoch": 0.1436851002173388,
"grad_norm": 0.3068223297595978,
"learning_rate": 7.689864158135612e-05,
"loss": 1.6969,
"step": 595
},
{
"epoch": 0.14392658778072928,
"grad_norm": 0.2943803369998932,
"learning_rate": 7.688667487641541e-05,
"loss": 1.6967,
"step": 596
},
{
"epoch": 0.14416807534411977,
"grad_norm": 0.3013668954372406,
"learning_rate": 7.687468606369409e-05,
"loss": 1.8011,
"step": 597
},
{
"epoch": 0.14440956290751025,
"grad_norm": 0.30205124616622925,
"learning_rate": 7.686267515037758e-05,
"loss": 1.672,
"step": 598
},
{
"epoch": 0.14465105047090074,
"grad_norm": 0.3033943474292755,
"learning_rate": 7.685064214366453e-05,
"loss": 1.8067,
"step": 599
},
{
"epoch": 0.14489253803429122,
"grad_norm": 0.30258500576019287,
"learning_rate": 7.683858705076684e-05,
"loss": 1.8625,
"step": 600
},
{
"epoch": 0.1451340255976817,
"grad_norm": 0.30624958872795105,
"learning_rate": 7.682650987890967e-05,
"loss": 1.8142,
"step": 601
},
{
"epoch": 0.1453755131610722,
"grad_norm": 0.30320626497268677,
"learning_rate": 7.681441063533138e-05,
"loss": 1.6951,
"step": 602
},
{
"epoch": 0.14561700072446268,
"grad_norm": 0.2955961525440216,
"learning_rate": 7.680228932728357e-05,
"loss": 1.6897,
"step": 603
},
{
"epoch": 0.14585848828785317,
"grad_norm": 0.3333013951778412,
"learning_rate": 7.679014596203104e-05,
"loss": 1.8817,
"step": 604
},
{
"epoch": 0.14609997585124365,
"grad_norm": 0.30491143465042114,
"learning_rate": 7.677798054685187e-05,
"loss": 1.6913,
"step": 605
},
{
"epoch": 0.14634146341463414,
"grad_norm": 0.30941450595855713,
"learning_rate": 7.676579308903732e-05,
"loss": 1.8067,
"step": 606
},
{
"epoch": 0.14658295097802462,
"grad_norm": 0.3031584620475769,
"learning_rate": 7.675358359589183e-05,
"loss": 1.8378,
"step": 607
},
{
"epoch": 0.1468244385414151,
"grad_norm": 0.32273533940315247,
"learning_rate": 7.67413520747331e-05,
"loss": 1.76,
"step": 608
},
{
"epoch": 0.1470659261048056,
"grad_norm": 0.3251398801803589,
"learning_rate": 7.6729098532892e-05,
"loss": 1.8813,
"step": 609
},
{
"epoch": 0.14730741366819608,
"grad_norm": 0.3235621750354767,
"learning_rate": 7.671682297771263e-05,
"loss": 1.9019,
"step": 610
},
{
"epoch": 0.14754890123158657,
"grad_norm": 0.30117496848106384,
"learning_rate": 7.670452541655224e-05,
"loss": 1.7701,
"step": 611
},
{
"epoch": 0.14779038879497705,
"grad_norm": 0.32307854294776917,
"learning_rate": 7.669220585678128e-05,
"loss": 1.908,
"step": 612
},
{
"epoch": 0.14803187635836754,
"grad_norm": 0.3234044015407562,
"learning_rate": 7.667986430578343e-05,
"loss": 2.0091,
"step": 613
},
{
"epoch": 0.14827336392175802,
"grad_norm": 0.3080267906188965,
"learning_rate": 7.666750077095548e-05,
"loss": 1.8048,
"step": 614
},
{
"epoch": 0.1485148514851485,
"grad_norm": 0.3124663233757019,
"learning_rate": 7.665511525970745e-05,
"loss": 1.8464,
"step": 615
},
{
"epoch": 0.148756339048539,
"grad_norm": 0.31696856021881104,
"learning_rate": 7.664270777946252e-05,
"loss": 1.8327,
"step": 616
},
{
"epoch": 0.14899782661192948,
"grad_norm": 0.3064039349555969,
"learning_rate": 7.663027833765702e-05,
"loss": 1.7434,
"step": 617
},
{
"epoch": 0.14923931417531996,
"grad_norm": 0.30550166964530945,
"learning_rate": 7.661782694174044e-05,
"loss": 1.6736,
"step": 618
},
{
"epoch": 0.14948080173871045,
"grad_norm": 0.3370753228664398,
"learning_rate": 7.660535359917547e-05,
"loss": 1.7706,
"step": 619
},
{
"epoch": 0.14972228930210094,
"grad_norm": 0.3034164309501648,
"learning_rate": 7.659285831743789e-05,
"loss": 1.6429,
"step": 620
},
{
"epoch": 0.14996377686549142,
"grad_norm": 0.32384395599365234,
"learning_rate": 7.65803411040167e-05,
"loss": 1.9318,
"step": 621
},
{
"epoch": 0.1502052644288819,
"grad_norm": 0.3539518117904663,
"learning_rate": 7.656780196641397e-05,
"loss": 2.1674,
"step": 622
},
{
"epoch": 0.1504467519922724,
"grad_norm": 0.3114670217037201,
"learning_rate": 7.655524091214497e-05,
"loss": 1.8364,
"step": 623
},
{
"epoch": 0.15068823955566288,
"grad_norm": 0.29424378275871277,
"learning_rate": 7.65426579487381e-05,
"loss": 1.6477,
"step": 624
},
{
"epoch": 0.15092972711905336,
"grad_norm": 0.35713982582092285,
"learning_rate": 7.653005308373482e-05,
"loss": 1.8045,
"step": 625
},
{
"epoch": 0.15117121468244385,
"grad_norm": 0.30860796570777893,
"learning_rate": 7.651742632468984e-05,
"loss": 1.9516,
"step": 626
},
{
"epoch": 0.15141270224583434,
"grad_norm": 0.2912321984767914,
"learning_rate": 7.650477767917087e-05,
"loss": 1.6368,
"step": 627
},
{
"epoch": 0.15165418980922482,
"grad_norm": 0.30829671025276184,
"learning_rate": 7.64921071547588e-05,
"loss": 1.865,
"step": 628
},
{
"epoch": 0.1518956773726153,
"grad_norm": 0.3148002028465271,
"learning_rate": 7.647941475904765e-05,
"loss": 1.8414,
"step": 629
},
{
"epoch": 0.1521371649360058,
"grad_norm": 0.3275551199913025,
"learning_rate": 7.646670049964449e-05,
"loss": 1.7045,
"step": 630
},
{
"epoch": 0.15237865249939628,
"grad_norm": 0.3136232793331146,
"learning_rate": 7.645396438416955e-05,
"loss": 1.7327,
"step": 631
},
{
"epoch": 0.15262014006278676,
"grad_norm": 0.31527405977249146,
"learning_rate": 7.644120642025613e-05,
"loss": 1.7708,
"step": 632
},
{
"epoch": 0.15286162762617725,
"grad_norm": 0.3223884701728821,
"learning_rate": 7.64284266155506e-05,
"loss": 1.8636,
"step": 633
},
{
"epoch": 0.15310311518956773,
"grad_norm": 0.31887826323509216,
"learning_rate": 7.64156249777125e-05,
"loss": 1.8202,
"step": 634
},
{
"epoch": 0.15334460275295822,
"grad_norm": 0.35070282220840454,
"learning_rate": 7.640280151441439e-05,
"loss": 2.0275,
"step": 635
},
{
"epoch": 0.1535860903163487,
"grad_norm": 0.3492684066295624,
"learning_rate": 7.63899562333419e-05,
"loss": 1.8542,
"step": 636
},
{
"epoch": 0.1538275778797392,
"grad_norm": 0.30183926224708557,
"learning_rate": 7.637708914219378e-05,
"loss": 1.6828,
"step": 637
},
{
"epoch": 0.15406906544312968,
"grad_norm": 0.3578021824359894,
"learning_rate": 7.636420024868184e-05,
"loss": 1.8462,
"step": 638
},
{
"epoch": 0.15431055300652016,
"grad_norm": 0.3210180401802063,
"learning_rate": 7.635128956053094e-05,
"loss": 1.7725,
"step": 639
},
{
"epoch": 0.15455204056991065,
"grad_norm": 0.30064085125923157,
"learning_rate": 7.633835708547904e-05,
"loss": 1.6716,
"step": 640
},
{
"epoch": 0.15479352813330113,
"grad_norm": 0.31479454040527344,
"learning_rate": 7.63254028312771e-05,
"loss": 1.687,
"step": 641
},
{
"epoch": 0.15503501569669162,
"grad_norm": 0.3353448510169983,
"learning_rate": 7.631242680568916e-05,
"loss": 2.04,
"step": 642
},
{
"epoch": 0.1552765032600821,
"grad_norm": 0.338562548160553,
"learning_rate": 7.629942901649236e-05,
"loss": 1.9637,
"step": 643
},
{
"epoch": 0.1555179908234726,
"grad_norm": 0.3367462456226349,
"learning_rate": 7.62864094714768e-05,
"loss": 1.9682,
"step": 644
},
{
"epoch": 0.15575947838686308,
"grad_norm": 0.29863229393959045,
"learning_rate": 7.627336817844565e-05,
"loss": 1.6644,
"step": 645
},
{
"epoch": 0.15600096595025356,
"grad_norm": 0.31191080808639526,
"learning_rate": 7.626030514521516e-05,
"loss": 1.7951,
"step": 646
},
{
"epoch": 0.15624245351364405,
"grad_norm": 0.3201664686203003,
"learning_rate": 7.624722037961453e-05,
"loss": 1.7746,
"step": 647
},
{
"epoch": 0.15648394107703453,
"grad_norm": 0.3075219988822937,
"learning_rate": 7.623411388948606e-05,
"loss": 1.6502,
"step": 648
},
{
"epoch": 0.15672542864042502,
"grad_norm": 0.31997132301330566,
"learning_rate": 7.622098568268502e-05,
"loss": 1.9077,
"step": 649
},
{
"epoch": 0.1569669162038155,
"grad_norm": 0.3383060097694397,
"learning_rate": 7.620783576707971e-05,
"loss": 1.8237,
"step": 650
},
{
"epoch": 0.157208403767206,
"grad_norm": 0.29756960272789,
"learning_rate": 7.619466415055146e-05,
"loss": 1.6257,
"step": 651
},
{
"epoch": 0.15744989133059648,
"grad_norm": 0.29845669865608215,
"learning_rate": 7.618147084099455e-05,
"loss": 1.7794,
"step": 652
},
{
"epoch": 0.15769137889398696,
"grad_norm": 0.2993009090423584,
"learning_rate": 7.616825584631635e-05,
"loss": 1.7485,
"step": 653
},
{
"epoch": 0.15793286645737745,
"grad_norm": 0.29437655210494995,
"learning_rate": 7.615501917443715e-05,
"loss": 1.7053,
"step": 654
},
{
"epoch": 0.15817435402076793,
"grad_norm": 0.29206910729408264,
"learning_rate": 7.614176083329028e-05,
"loss": 1.6886,
"step": 655
},
{
"epoch": 0.15841584158415842,
"grad_norm": 0.3151334524154663,
"learning_rate": 7.6128480830822e-05,
"loss": 1.8169,
"step": 656
},
{
"epoch": 0.1586573291475489,
"grad_norm": 0.48860466480255127,
"learning_rate": 7.611517917499164e-05,
"loss": 2.2244,
"step": 657
},
{
"epoch": 0.1588988167109394,
"grad_norm": 0.2999897003173828,
"learning_rate": 7.610185587377143e-05,
"loss": 1.6181,
"step": 658
},
{
"epoch": 0.15914030427432987,
"grad_norm": 0.3048444390296936,
"learning_rate": 7.608851093514659e-05,
"loss": 1.8048,
"step": 659
},
{
"epoch": 0.15938179183772036,
"grad_norm": 0.3159238398075104,
"learning_rate": 7.607514436711534e-05,
"loss": 1.7586,
"step": 660
},
{
"epoch": 0.15962327940111085,
"grad_norm": 0.32997772097587585,
"learning_rate": 7.606175617768884e-05,
"loss": 1.8612,
"step": 661
},
{
"epoch": 0.15986476696450133,
"grad_norm": 0.30874085426330566,
"learning_rate": 7.60483463748912e-05,
"loss": 1.8608,
"step": 662
},
{
"epoch": 0.16010625452789182,
"grad_norm": 0.3253762722015381,
"learning_rate": 7.603491496675951e-05,
"loss": 1.9862,
"step": 663
},
{
"epoch": 0.1603477420912823,
"grad_norm": 0.3072294592857361,
"learning_rate": 7.602146196134378e-05,
"loss": 1.6203,
"step": 664
},
{
"epoch": 0.1605892296546728,
"grad_norm": 0.32605570554733276,
"learning_rate": 7.6007987366707e-05,
"loss": 1.7996,
"step": 665
},
{
"epoch": 0.16083071721806327,
"grad_norm": 0.2969420254230499,
"learning_rate": 7.599449119092504e-05,
"loss": 1.7149,
"step": 666
},
{
"epoch": 0.16107220478145376,
"grad_norm": 0.3176361918449402,
"learning_rate": 7.598097344208679e-05,
"loss": 1.7544,
"step": 667
},
{
"epoch": 0.16131369234484425,
"grad_norm": 0.3468224108219147,
"learning_rate": 7.596743412829398e-05,
"loss": 1.6971,
"step": 668
},
{
"epoch": 0.16155517990823473,
"grad_norm": 0.3290475308895111,
"learning_rate": 7.595387325766133e-05,
"loss": 1.7264,
"step": 669
},
{
"epoch": 0.16179666747162522,
"grad_norm": 0.31547752022743225,
"learning_rate": 7.594029083831644e-05,
"loss": 1.7264,
"step": 670
},
{
"epoch": 0.1620381550350157,
"grad_norm": 0.3173413872718811,
"learning_rate": 7.592668687839987e-05,
"loss": 1.7354,
"step": 671
},
{
"epoch": 0.1622796425984062,
"grad_norm": 0.35088086128234863,
"learning_rate": 7.591306138606502e-05,
"loss": 1.6187,
"step": 672
},
{
"epoch": 0.16252113016179667,
"grad_norm": 0.31287261843681335,
"learning_rate": 7.589941436947828e-05,
"loss": 1.7694,
"step": 673
},
{
"epoch": 0.16276261772518716,
"grad_norm": 0.35935917496681213,
"learning_rate": 7.588574583681888e-05,
"loss": 1.9953,
"step": 674
},
{
"epoch": 0.16300410528857764,
"grad_norm": 0.32228848338127136,
"learning_rate": 7.587205579627896e-05,
"loss": 1.8309,
"step": 675
},
{
"epoch": 0.16324559285196813,
"grad_norm": 0.324415385723114,
"learning_rate": 7.585834425606355e-05,
"loss": 1.8214,
"step": 676
},
{
"epoch": 0.16348708041535862,
"grad_norm": 0.31052157282829285,
"learning_rate": 7.584461122439057e-05,
"loss": 1.6383,
"step": 677
},
{
"epoch": 0.1637285679787491,
"grad_norm": 0.30478107929229736,
"learning_rate": 7.583085670949083e-05,
"loss": 1.6576,
"step": 678
},
{
"epoch": 0.1639700555421396,
"grad_norm": 0.29964715242385864,
"learning_rate": 7.581708071960801e-05,
"loss": 1.6084,
"step": 679
},
{
"epoch": 0.16421154310553007,
"grad_norm": 0.3476538360118866,
"learning_rate": 7.580328326299863e-05,
"loss": 1.9535,
"step": 680
},
{
"epoch": 0.16445303066892056,
"grad_norm": 0.32849040627479553,
"learning_rate": 7.578946434793215e-05,
"loss": 1.8971,
"step": 681
},
{
"epoch": 0.16469451823231104,
"grad_norm": 0.3057873845100403,
"learning_rate": 7.577562398269079e-05,
"loss": 1.648,
"step": 682
},
{
"epoch": 0.16493600579570153,
"grad_norm": 0.2937708795070648,
"learning_rate": 7.576176217556972e-05,
"loss": 1.6217,
"step": 683
},
{
"epoch": 0.16517749335909201,
"grad_norm": 0.30420657992362976,
"learning_rate": 7.57478789348769e-05,
"loss": 1.7992,
"step": 684
},
{
"epoch": 0.1654189809224825,
"grad_norm": 0.31237706542015076,
"learning_rate": 7.573397426893316e-05,
"loss": 1.7492,
"step": 685
},
{
"epoch": 0.16566046848587299,
"grad_norm": 0.36982595920562744,
"learning_rate": 7.572004818607218e-05,
"loss": 1.7512,
"step": 686
},
{
"epoch": 0.16590195604926347,
"grad_norm": 0.2950628697872162,
"learning_rate": 7.570610069464045e-05,
"loss": 1.7111,
"step": 687
},
{
"epoch": 0.16614344361265396,
"grad_norm": 0.30206093192100525,
"learning_rate": 7.569213180299732e-05,
"loss": 1.8203,
"step": 688
},
{
"epoch": 0.16638493117604444,
"grad_norm": 0.3030879497528076,
"learning_rate": 7.567814151951493e-05,
"loss": 1.7221,
"step": 689
},
{
"epoch": 0.16662641873943493,
"grad_norm": 0.3175910711288452,
"learning_rate": 7.566412985257826e-05,
"loss": 1.783,
"step": 690
},
{
"epoch": 0.16686790630282541,
"grad_norm": 0.3054318130016327,
"learning_rate": 7.565009681058514e-05,
"loss": 1.679,
"step": 691
},
{
"epoch": 0.1671093938662159,
"grad_norm": 0.3093288242816925,
"learning_rate": 7.563604240194616e-05,
"loss": 1.778,
"step": 692
},
{
"epoch": 0.16735088142960639,
"grad_norm": 0.3029101490974426,
"learning_rate": 7.562196663508473e-05,
"loss": 1.7636,
"step": 693
},
{
"epoch": 0.16759236899299687,
"grad_norm": 0.3081244230270386,
"learning_rate": 7.56078695184371e-05,
"loss": 1.8207,
"step": 694
},
{
"epoch": 0.16783385655638736,
"grad_norm": 0.30802908539772034,
"learning_rate": 7.559375106045223e-05,
"loss": 1.7582,
"step": 695
},
{
"epoch": 0.16807534411977784,
"grad_norm": 0.32356002926826477,
"learning_rate": 7.557961126959194e-05,
"loss": 1.8012,
"step": 696
},
{
"epoch": 0.16831683168316833,
"grad_norm": 0.3083191514015198,
"learning_rate": 7.556545015433084e-05,
"loss": 1.6644,
"step": 697
},
{
"epoch": 0.1685583192465588,
"grad_norm": 0.3402654528617859,
"learning_rate": 7.555126772315629e-05,
"loss": 1.8862,
"step": 698
},
{
"epoch": 0.1687998068099493,
"grad_norm": 0.3095254898071289,
"learning_rate": 7.553706398456841e-05,
"loss": 1.7341,
"step": 699
},
{
"epoch": 0.16904129437333978,
"grad_norm": 0.30369865894317627,
"learning_rate": 7.552283894708015e-05,
"loss": 1.7315,
"step": 700
},
{
"epoch": 0.16928278193673027,
"grad_norm": 0.319938600063324,
"learning_rate": 7.550859261921719e-05,
"loss": 1.7972,
"step": 701
},
{
"epoch": 0.16952426950012076,
"grad_norm": 0.299113392829895,
"learning_rate": 7.549432500951796e-05,
"loss": 1.7532,
"step": 702
},
{
"epoch": 0.16976575706351124,
"grad_norm": 0.29605212807655334,
"learning_rate": 7.548003612653362e-05,
"loss": 1.7625,
"step": 703
},
{
"epoch": 0.17000724462690173,
"grad_norm": 0.3049871325492859,
"learning_rate": 7.546572597882818e-05,
"loss": 1.7958,
"step": 704
},
{
"epoch": 0.1702487321902922,
"grad_norm": 0.30870726704597473,
"learning_rate": 7.545139457497829e-05,
"loss": 1.7153,
"step": 705
},
{
"epoch": 0.1704902197536827,
"grad_norm": 0.31261366605758667,
"learning_rate": 7.54370419235734e-05,
"loss": 1.704,
"step": 706
},
{
"epoch": 0.17073170731707318,
"grad_norm": 0.32341545820236206,
"learning_rate": 7.542266803321564e-05,
"loss": 1.6498,
"step": 707
},
{
"epoch": 0.17097319488046367,
"grad_norm": 0.3037106990814209,
"learning_rate": 7.540827291251996e-05,
"loss": 1.726,
"step": 708
},
{
"epoch": 0.17121468244385415,
"grad_norm": 0.2945062220096588,
"learning_rate": 7.539385657011393e-05,
"loss": 1.6776,
"step": 709
},
{
"epoch": 0.17145617000724464,
"grad_norm": 0.3037776052951813,
"learning_rate": 7.537941901463791e-05,
"loss": 1.7051,
"step": 710
},
{
"epoch": 0.17169765757063513,
"grad_norm": 0.34461262822151184,
"learning_rate": 7.536496025474496e-05,
"loss": 1.5792,
"step": 711
},
{
"epoch": 0.17193914513402558,
"grad_norm": 0.2971360981464386,
"learning_rate": 7.535048029910081e-05,
"loss": 1.7157,
"step": 712
},
{
"epoch": 0.17218063269741607,
"grad_norm": 0.3049238324165344,
"learning_rate": 7.533597915638397e-05,
"loss": 1.8328,
"step": 713
},
{
"epoch": 0.17242212026080656,
"grad_norm": 0.29996106028556824,
"learning_rate": 7.532145683528555e-05,
"loss": 1.7274,
"step": 714
},
{
"epoch": 0.17266360782419704,
"grad_norm": 0.3050224483013153,
"learning_rate": 7.530691334450945e-05,
"loss": 1.6866,
"step": 715
},
{
"epoch": 0.17290509538758753,
"grad_norm": 0.3068046569824219,
"learning_rate": 7.529234869277219e-05,
"loss": 1.792,
"step": 716
},
{
"epoch": 0.173146582950978,
"grad_norm": 0.3204353451728821,
"learning_rate": 7.5277762888803e-05,
"loss": 1.7847,
"step": 717
},
{
"epoch": 0.1733880705143685,
"grad_norm": 0.4433777928352356,
"learning_rate": 7.526315594134378e-05,
"loss": 1.762,
"step": 718
},
{
"epoch": 0.17362955807775898,
"grad_norm": 0.3121720254421234,
"learning_rate": 7.524852785914911e-05,
"loss": 1.7186,
"step": 719
},
{
"epoch": 0.17387104564114947,
"grad_norm": 0.3399069309234619,
"learning_rate": 7.523387865098624e-05,
"loss": 1.8693,
"step": 720
},
{
"epoch": 0.17411253320453995,
"grad_norm": 0.3379225432872772,
"learning_rate": 7.521920832563506e-05,
"loss": 1.7691,
"step": 721
},
{
"epoch": 0.17435402076793044,
"grad_norm": 0.30595719814300537,
"learning_rate": 7.520451689188814e-05,
"loss": 1.726,
"step": 722
},
{
"epoch": 0.17459550833132093,
"grad_norm": 0.29468265175819397,
"learning_rate": 7.518980435855071e-05,
"loss": 1.673,
"step": 723
},
{
"epoch": 0.1748369958947114,
"grad_norm": 0.3205685019493103,
"learning_rate": 7.517507073444059e-05,
"loss": 1.9188,
"step": 724
},
{
"epoch": 0.1750784834581019,
"grad_norm": 0.32377034425735474,
"learning_rate": 7.51603160283883e-05,
"loss": 1.7882,
"step": 725
},
{
"epoch": 0.17531997102149238,
"grad_norm": 0.32858628034591675,
"learning_rate": 7.514554024923697e-05,
"loss": 1.8163,
"step": 726
},
{
"epoch": 0.17556145858488287,
"grad_norm": 0.30413132905960083,
"learning_rate": 7.513074340584237e-05,
"loss": 1.6486,
"step": 727
},
{
"epoch": 0.17580294614827335,
"grad_norm": 0.30543509125709534,
"learning_rate": 7.511592550707286e-05,
"loss": 1.6792,
"step": 728
},
{
"epoch": 0.17604443371166384,
"grad_norm": 0.3092809319496155,
"learning_rate": 7.51010865618095e-05,
"loss": 1.7791,
"step": 729
},
{
"epoch": 0.17628592127505432,
"grad_norm": 0.32701924443244934,
"learning_rate": 7.508622657894588e-05,
"loss": 1.6883,
"step": 730
},
{
"epoch": 0.1765274088384448,
"grad_norm": 0.33039215207099915,
"learning_rate": 7.507134556738822e-05,
"loss": 1.9009,
"step": 731
},
{
"epoch": 0.1767688964018353,
"grad_norm": 0.3000987470149994,
"learning_rate": 7.505644353605538e-05,
"loss": 1.7143,
"step": 732
},
{
"epoch": 0.17701038396522578,
"grad_norm": 0.3035810589790344,
"learning_rate": 7.504152049387878e-05,
"loss": 1.6682,
"step": 733
},
{
"epoch": 0.17725187152861627,
"grad_norm": 0.30469194054603577,
"learning_rate": 7.502657644980244e-05,
"loss": 1.7519,
"step": 734
},
{
"epoch": 0.17749335909200675,
"grad_norm": 0.30051693320274353,
"learning_rate": 7.501161141278298e-05,
"loss": 1.7051,
"step": 735
},
{
"epoch": 0.17773484665539724,
"grad_norm": 0.31448641419410706,
"learning_rate": 7.499662539178958e-05,
"loss": 1.674,
"step": 736
},
{
"epoch": 0.17797633421878772,
"grad_norm": 0.321920245885849,
"learning_rate": 7.498161839580405e-05,
"loss": 1.7703,
"step": 737
},
{
"epoch": 0.1782178217821782,
"grad_norm": 0.34229952096939087,
"learning_rate": 7.496659043382069e-05,
"loss": 1.7585,
"step": 738
},
{
"epoch": 0.1784593093455687,
"grad_norm": 0.2906430661678314,
"learning_rate": 7.495154151484644e-05,
"loss": 1.6548,
"step": 739
},
{
"epoch": 0.17870079690895918,
"grad_norm": 0.30244144797325134,
"learning_rate": 7.493647164790074e-05,
"loss": 1.7009,
"step": 740
},
{
"epoch": 0.17894228447234967,
"grad_norm": 0.31220030784606934,
"learning_rate": 7.492138084201561e-05,
"loss": 1.8037,
"step": 741
},
{
"epoch": 0.17918377203574015,
"grad_norm": 0.2887391149997711,
"learning_rate": 7.490626910623566e-05,
"loss": 1.6375,
"step": 742
},
{
"epoch": 0.17942525959913064,
"grad_norm": 0.3369121849536896,
"learning_rate": 7.489113644961797e-05,
"loss": 1.8906,
"step": 743
},
{
"epoch": 0.17966674716252112,
"grad_norm": 0.3304523527622223,
"learning_rate": 7.487598288123222e-05,
"loss": 1.8778,
"step": 744
},
{
"epoch": 0.1799082347259116,
"grad_norm": 0.3136029541492462,
"learning_rate": 7.486080841016059e-05,
"loss": 1.7777,
"step": 745
},
{
"epoch": 0.1801497222893021,
"grad_norm": 0.3245154619216919,
"learning_rate": 7.48456130454978e-05,
"loss": 1.7184,
"step": 746
},
{
"epoch": 0.18039120985269258,
"grad_norm": 0.2960347533226013,
"learning_rate": 7.48303967963511e-05,
"loss": 1.6104,
"step": 747
},
{
"epoch": 0.18063269741608307,
"grad_norm": 0.31395086646080017,
"learning_rate": 7.481515967184021e-05,
"loss": 1.8418,
"step": 748
},
{
"epoch": 0.18087418497947355,
"grad_norm": 0.30373284220695496,
"learning_rate": 7.479990168109744e-05,
"loss": 1.7451,
"step": 749
},
{
"epoch": 0.18111567254286404,
"grad_norm": 0.3072919249534607,
"learning_rate": 7.478462283326754e-05,
"loss": 1.7898,
"step": 750
},
{
"epoch": 0.18135716010625452,
"grad_norm": 0.2961108386516571,
"learning_rate": 7.476932313750779e-05,
"loss": 1.6443,
"step": 751
},
{
"epoch": 0.181598647669645,
"grad_norm": 0.3386465609073639,
"learning_rate": 7.475400260298797e-05,
"loss": 1.9018,
"step": 752
},
{
"epoch": 0.1818401352330355,
"grad_norm": 0.3179508447647095,
"learning_rate": 7.473866123889032e-05,
"loss": 1.7945,
"step": 753
},
{
"epoch": 0.18208162279642598,
"grad_norm": 0.30372482538223267,
"learning_rate": 7.472329905440961e-05,
"loss": 1.7731,
"step": 754
},
{
"epoch": 0.18232311035981646,
"grad_norm": 0.2982485890388489,
"learning_rate": 7.470791605875302e-05,
"loss": 1.7926,
"step": 755
},
{
"epoch": 0.18256459792320695,
"grad_norm": 0.3064810633659363,
"learning_rate": 7.46925122611403e-05,
"loss": 1.7192,
"step": 756
},
{
"epoch": 0.18280608548659744,
"grad_norm": 0.3069106340408325,
"learning_rate": 7.467708767080358e-05,
"loss": 1.7361,
"step": 757
},
{
"epoch": 0.18304757304998792,
"grad_norm": 0.31539079546928406,
"learning_rate": 7.466164229698747e-05,
"loss": 1.7761,
"step": 758
},
{
"epoch": 0.1832890606133784,
"grad_norm": 0.3114735782146454,
"learning_rate": 7.464617614894908e-05,
"loss": 1.8215,
"step": 759
},
{
"epoch": 0.1835305481767689,
"grad_norm": 0.3202139437198639,
"learning_rate": 7.463068923595792e-05,
"loss": 1.7645,
"step": 760
},
{
"epoch": 0.18377203574015938,
"grad_norm": 0.2983264625072479,
"learning_rate": 7.461518156729599e-05,
"loss": 1.8844,
"step": 761
},
{
"epoch": 0.18401352330354986,
"grad_norm": 0.32856181263923645,
"learning_rate": 7.45996531522577e-05,
"loss": 1.9035,
"step": 762
},
{
"epoch": 0.18425501086694035,
"grad_norm": 0.31148043274879456,
"learning_rate": 7.45841040001499e-05,
"loss": 1.837,
"step": 763
},
{
"epoch": 0.18449649843033084,
"grad_norm": 0.30822792649269104,
"learning_rate": 7.456853412029184e-05,
"loss": 1.7931,
"step": 764
},
{
"epoch": 0.18473798599372132,
"grad_norm": 0.3262118101119995,
"learning_rate": 7.455294352201528e-05,
"loss": 1.8056,
"step": 765
},
{
"epoch": 0.1849794735571118,
"grad_norm": 0.32270196080207825,
"learning_rate": 7.453733221466429e-05,
"loss": 1.8264,
"step": 766
},
{
"epoch": 0.1852209611205023,
"grad_norm": 0.32929688692092896,
"learning_rate": 7.452170020759542e-05,
"loss": 1.8021,
"step": 767
},
{
"epoch": 0.18546244868389278,
"grad_norm": 0.3017376661300659,
"learning_rate": 7.450604751017762e-05,
"loss": 1.7503,
"step": 768
},
{
"epoch": 0.18570393624728326,
"grad_norm": 0.3037002980709076,
"learning_rate": 7.449037413179222e-05,
"loss": 1.6066,
"step": 769
},
{
"epoch": 0.18594542381067375,
"grad_norm": 0.3149946630001068,
"learning_rate": 7.447468008183295e-05,
"loss": 1.7274,
"step": 770
},
{
"epoch": 0.18618691137406423,
"grad_norm": 0.3143094480037689,
"learning_rate": 7.445896536970592e-05,
"loss": 1.7744,
"step": 771
},
{
"epoch": 0.18642839893745472,
"grad_norm": 0.32124435901641846,
"learning_rate": 7.444323000482968e-05,
"loss": 1.8213,
"step": 772
},
{
"epoch": 0.1866698865008452,
"grad_norm": 0.3078225255012512,
"learning_rate": 7.442747399663507e-05,
"loss": 1.6668,
"step": 773
},
{
"epoch": 0.1869113740642357,
"grad_norm": 0.3109733462333679,
"learning_rate": 7.441169735456537e-05,
"loss": 1.7679,
"step": 774
},
{
"epoch": 0.18715286162762618,
"grad_norm": 0.3216419816017151,
"learning_rate": 7.439590008807621e-05,
"loss": 1.8956,
"step": 775
},
{
"epoch": 0.18739434919101666,
"grad_norm": 0.3254031836986542,
"learning_rate": 7.438008220663556e-05,
"loss": 1.8686,
"step": 776
},
{
"epoch": 0.18763583675440715,
"grad_norm": 0.31165701150894165,
"learning_rate": 7.436424371972376e-05,
"loss": 1.6975,
"step": 777
},
{
"epoch": 0.18787732431779763,
"grad_norm": 0.3033682703971863,
"learning_rate": 7.43483846368335e-05,
"loss": 1.7078,
"step": 778
},
{
"epoch": 0.18811881188118812,
"grad_norm": 0.3072250783443451,
"learning_rate": 7.433250496746985e-05,
"loss": 1.6495,
"step": 779
},
{
"epoch": 0.1883602994445786,
"grad_norm": 0.3142796754837036,
"learning_rate": 7.431660472115013e-05,
"loss": 1.6211,
"step": 780
},
{
"epoch": 0.1886017870079691,
"grad_norm": 0.31042274832725525,
"learning_rate": 7.430068390740409e-05,
"loss": 1.7299,
"step": 781
},
{
"epoch": 0.18884327457135958,
"grad_norm": 0.34713032841682434,
"learning_rate": 7.428474253577372e-05,
"loss": 1.9567,
"step": 782
},
{
"epoch": 0.18908476213475006,
"grad_norm": 0.30852535367012024,
"learning_rate": 7.426878061581342e-05,
"loss": 1.8149,
"step": 783
},
{
"epoch": 0.18932624969814055,
"grad_norm": 0.30727940797805786,
"learning_rate": 7.425279815708981e-05,
"loss": 1.7005,
"step": 784
},
{
"epoch": 0.18956773726153103,
"grad_norm": 0.3225751221179962,
"learning_rate": 7.423679516918192e-05,
"loss": 1.9,
"step": 785
},
{
"epoch": 0.18980922482492152,
"grad_norm": 0.3211837112903595,
"learning_rate": 7.4220771661681e-05,
"loss": 1.7269,
"step": 786
},
{
"epoch": 0.190050712388312,
"grad_norm": 0.3182501196861267,
"learning_rate": 7.420472764419065e-05,
"loss": 1.7219,
"step": 787
},
{
"epoch": 0.1902921999517025,
"grad_norm": 0.32098039984703064,
"learning_rate": 7.418866312632673e-05,
"loss": 1.7289,
"step": 788
},
{
"epoch": 0.19053368751509298,
"grad_norm": 0.3143134117126465,
"learning_rate": 7.41725781177174e-05,
"loss": 1.7644,
"step": 789
},
{
"epoch": 0.19077517507848346,
"grad_norm": 0.3101259469985962,
"learning_rate": 7.415647262800311e-05,
"loss": 1.7912,
"step": 790
},
{
"epoch": 0.19101666264187395,
"grad_norm": 0.33307313919067383,
"learning_rate": 7.414034666683657e-05,
"loss": 1.8878,
"step": 791
},
{
"epoch": 0.19125815020526443,
"grad_norm": 0.31299763917922974,
"learning_rate": 7.412420024388279e-05,
"loss": 1.7598,
"step": 792
},
{
"epoch": 0.19149963776865492,
"grad_norm": 0.2992435693740845,
"learning_rate": 7.410803336881898e-05,
"loss": 1.6938,
"step": 793
},
{
"epoch": 0.1917411253320454,
"grad_norm": 0.3298616111278534,
"learning_rate": 7.409184605133468e-05,
"loss": 1.7812,
"step": 794
},
{
"epoch": 0.1919826128954359,
"grad_norm": 0.3269992768764496,
"learning_rate": 7.407563830113163e-05,
"loss": 1.916,
"step": 795
},
{
"epoch": 0.19222410045882637,
"grad_norm": 0.3093508780002594,
"learning_rate": 7.405941012792385e-05,
"loss": 1.8387,
"step": 796
},
{
"epoch": 0.19246558802221686,
"grad_norm": 0.34046638011932373,
"learning_rate": 7.404316154143757e-05,
"loss": 2.0301,
"step": 797
},
{
"epoch": 0.19270707558560735,
"grad_norm": 0.32975658774375916,
"learning_rate": 7.40268925514113e-05,
"loss": 1.8212,
"step": 798
},
{
"epoch": 0.19294856314899783,
"grad_norm": 0.3060869872570038,
"learning_rate": 7.401060316759574e-05,
"loss": 1.7069,
"step": 799
},
{
"epoch": 0.19319005071238832,
"grad_norm": 0.31247884035110474,
"learning_rate": 7.399429339975379e-05,
"loss": 1.8025,
"step": 800
},
{
"epoch": 0.1934315382757788,
"grad_norm": 0.31649911403656006,
"learning_rate": 7.397796325766063e-05,
"loss": 1.7576,
"step": 801
},
{
"epoch": 0.1936730258391693,
"grad_norm": 0.3336128294467926,
"learning_rate": 7.396161275110362e-05,
"loss": 1.873,
"step": 802
},
{
"epoch": 0.19391451340255977,
"grad_norm": 0.31450867652893066,
"learning_rate": 7.394524188988232e-05,
"loss": 1.8446,
"step": 803
},
{
"epoch": 0.19415600096595026,
"grad_norm": 0.3035898506641388,
"learning_rate": 7.39288506838085e-05,
"loss": 1.6569,
"step": 804
},
{
"epoch": 0.19439748852934075,
"grad_norm": 0.3701397478580475,
"learning_rate": 7.39124391427061e-05,
"loss": 1.6752,
"step": 805
},
{
"epoch": 0.19463897609273123,
"grad_norm": 0.32577869296073914,
"learning_rate": 7.389600727641131e-05,
"loss": 1.7124,
"step": 806
},
{
"epoch": 0.19488046365612172,
"grad_norm": 0.3136950433254242,
"learning_rate": 7.387955509477242e-05,
"loss": 1.7551,
"step": 807
},
{
"epoch": 0.1951219512195122,
"grad_norm": 0.31354862451553345,
"learning_rate": 7.386308260764995e-05,
"loss": 1.7758,
"step": 808
},
{
"epoch": 0.1953634387829027,
"grad_norm": 0.38379475474357605,
"learning_rate": 7.384658982491657e-05,
"loss": 1.8878,
"step": 809
},
{
"epoch": 0.19560492634629317,
"grad_norm": 0.3336881101131439,
"learning_rate": 7.383007675645712e-05,
"loss": 1.9188,
"step": 810
},
{
"epoch": 0.19584641390968366,
"grad_norm": 0.2996203303337097,
"learning_rate": 7.381354341216858e-05,
"loss": 1.5913,
"step": 811
},
{
"epoch": 0.19608790147307414,
"grad_norm": 0.33541610836982727,
"learning_rate": 7.379698980196013e-05,
"loss": 1.7095,
"step": 812
},
{
"epoch": 0.19632938903646463,
"grad_norm": 0.30465638637542725,
"learning_rate": 7.378041593575305e-05,
"loss": 1.6976,
"step": 813
},
{
"epoch": 0.19657087659985512,
"grad_norm": 0.30584558844566345,
"learning_rate": 7.376382182348076e-05,
"loss": 1.5261,
"step": 814
},
{
"epoch": 0.1968123641632456,
"grad_norm": 0.34166428446769714,
"learning_rate": 7.374720747508885e-05,
"loss": 1.9264,
"step": 815
},
{
"epoch": 0.1970538517266361,
"grad_norm": 0.3009068965911865,
"learning_rate": 7.373057290053502e-05,
"loss": 1.8388,
"step": 816
},
{
"epoch": 0.19729533929002657,
"grad_norm": 0.3169417381286621,
"learning_rate": 7.371391810978909e-05,
"loss": 1.6588,
"step": 817
},
{
"epoch": 0.19753682685341706,
"grad_norm": 0.30887770652770996,
"learning_rate": 7.369724311283296e-05,
"loss": 1.7986,
"step": 818
},
{
"epoch": 0.19777831441680754,
"grad_norm": 0.31081607937812805,
"learning_rate": 7.368054791966073e-05,
"loss": 1.6954,
"step": 819
},
{
"epoch": 0.19801980198019803,
"grad_norm": 0.3122561275959015,
"learning_rate": 7.366383254027853e-05,
"loss": 1.738,
"step": 820
},
{
"epoch": 0.19826128954358851,
"grad_norm": 0.30774828791618347,
"learning_rate": 7.36470969847046e-05,
"loss": 1.7573,
"step": 821
},
{
"epoch": 0.198502777106979,
"grad_norm": 0.29315096139907837,
"learning_rate": 7.36303412629693e-05,
"loss": 1.6716,
"step": 822
},
{
"epoch": 0.19874426467036949,
"grad_norm": 0.30455148220062256,
"learning_rate": 7.361356538511506e-05,
"loss": 1.7381,
"step": 823
},
{
"epoch": 0.19898575223375997,
"grad_norm": 0.3003849387168884,
"learning_rate": 7.359676936119635e-05,
"loss": 1.7738,
"step": 824
},
{
"epoch": 0.19922723979715046,
"grad_norm": 0.3111303746700287,
"learning_rate": 7.357995320127981e-05,
"loss": 1.8793,
"step": 825
},
{
"epoch": 0.19946872736054094,
"grad_norm": 0.31026238203048706,
"learning_rate": 7.356311691544406e-05,
"loss": 1.6743,
"step": 826
},
{
"epoch": 0.19971021492393143,
"grad_norm": 0.30901458859443665,
"learning_rate": 7.354626051377981e-05,
"loss": 1.6457,
"step": 827
},
{
"epoch": 0.19995170248732191,
"grad_norm": 0.3286570906639099,
"learning_rate": 7.352938400638986e-05,
"loss": 1.9002,
"step": 828
},
{
"epoch": 0.2001931900507124,
"grad_norm": 0.3596792221069336,
"learning_rate": 7.3512487403389e-05,
"loss": 2.0187,
"step": 829
},
{
"epoch": 0.20043467761410289,
"grad_norm": 0.325364887714386,
"learning_rate": 7.349557071490411e-05,
"loss": 1.9584,
"step": 830
},
{
"epoch": 0.20067616517749337,
"grad_norm": 0.3105791509151459,
"learning_rate": 7.347863395107411e-05,
"loss": 1.7492,
"step": 831
},
{
"epoch": 0.20091765274088386,
"grad_norm": 0.3149196207523346,
"learning_rate": 7.346167712204991e-05,
"loss": 1.7646,
"step": 832
},
{
"epoch": 0.20115914030427434,
"grad_norm": 0.31853047013282776,
"learning_rate": 7.344470023799447e-05,
"loss": 1.7379,
"step": 833
},
{
"epoch": 0.20140062786766483,
"grad_norm": 0.31346553564071655,
"learning_rate": 7.34277033090828e-05,
"loss": 1.7015,
"step": 834
},
{
"epoch": 0.2016421154310553,
"grad_norm": 0.31461572647094727,
"learning_rate": 7.341068634550185e-05,
"loss": 1.8486,
"step": 835
},
{
"epoch": 0.2018836029944458,
"grad_norm": 0.31537574529647827,
"learning_rate": 7.339364935745067e-05,
"loss": 1.6802,
"step": 836
},
{
"epoch": 0.20212509055783628,
"grad_norm": 0.3205339014530182,
"learning_rate": 7.337659235514024e-05,
"loss": 1.7981,
"step": 837
},
{
"epoch": 0.20236657812122677,
"grad_norm": 0.3003098666667938,
"learning_rate": 7.335951534879356e-05,
"loss": 1.7005,
"step": 838
},
{
"epoch": 0.20260806568461726,
"grad_norm": 0.3131178617477417,
"learning_rate": 7.334241834864562e-05,
"loss": 1.6863,
"step": 839
},
{
"epoch": 0.20284955324800774,
"grad_norm": 0.3473765552043915,
"learning_rate": 7.33253013649434e-05,
"loss": 2.0212,
"step": 840
},
{
"epoch": 0.20309104081139823,
"grad_norm": 0.2994740307331085,
"learning_rate": 7.330816440794585e-05,
"loss": 1.7631,
"step": 841
},
{
"epoch": 0.20333252837478868,
"grad_norm": 0.29169461131095886,
"learning_rate": 7.329100748792387e-05,
"loss": 1.5282,
"step": 842
},
{
"epoch": 0.20357401593817917,
"grad_norm": 0.3285301625728607,
"learning_rate": 7.327383061516035e-05,
"loss": 1.8478,
"step": 843
},
{
"epoch": 0.20381550350156966,
"grad_norm": 0.2975311279296875,
"learning_rate": 7.325663379995016e-05,
"loss": 1.6736,
"step": 844
},
{
"epoch": 0.20405699106496014,
"grad_norm": 0.30266156792640686,
"learning_rate": 7.323941705260006e-05,
"loss": 1.7203,
"step": 845
},
{
"epoch": 0.20429847862835063,
"grad_norm": 0.29713326692581177,
"learning_rate": 7.322218038342881e-05,
"loss": 1.6709,
"step": 846
},
{
"epoch": 0.2045399661917411,
"grad_norm": 0.29916587471961975,
"learning_rate": 7.320492380276711e-05,
"loss": 1.6718,
"step": 847
},
{
"epoch": 0.2047814537551316,
"grad_norm": 0.31391361355781555,
"learning_rate": 7.318764732095753e-05,
"loss": 1.8098,
"step": 848
},
{
"epoch": 0.20502294131852208,
"grad_norm": 0.3385540843009949,
"learning_rate": 7.317035094835467e-05,
"loss": 1.916,
"step": 849
},
{
"epoch": 0.20526442888191257,
"grad_norm": 0.31298068165779114,
"learning_rate": 7.315303469532494e-05,
"loss": 1.7697,
"step": 850
},
{
"epoch": 0.20550591644530306,
"grad_norm": 0.30968594551086426,
"learning_rate": 7.313569857224674e-05,
"loss": 1.7315,
"step": 851
},
{
"epoch": 0.20574740400869354,
"grad_norm": 0.301782488822937,
"learning_rate": 7.311834258951038e-05,
"loss": 1.6389,
"step": 852
},
{
"epoch": 0.20598889157208403,
"grad_norm": 0.3327527940273285,
"learning_rate": 7.310096675751802e-05,
"loss": 1.9758,
"step": 853
},
{
"epoch": 0.2062303791354745,
"grad_norm": 0.3157320022583008,
"learning_rate": 7.308357108668377e-05,
"loss": 1.9141,
"step": 854
},
{
"epoch": 0.206471866698865,
"grad_norm": 0.3155609667301178,
"learning_rate": 7.306615558743358e-05,
"loss": 1.741,
"step": 855
},
{
"epoch": 0.20671335426225548,
"grad_norm": 0.326360285282135,
"learning_rate": 7.304872027020536e-05,
"loss": 1.9365,
"step": 856
},
{
"epoch": 0.20695484182564597,
"grad_norm": 0.31044507026672363,
"learning_rate": 7.303126514544881e-05,
"loss": 1.7731,
"step": 857
},
{
"epoch": 0.20719632938903645,
"grad_norm": 0.3183859884738922,
"learning_rate": 7.301379022362556e-05,
"loss": 1.85,
"step": 858
},
{
"epoch": 0.20743781695242694,
"grad_norm": 0.31985989212989807,
"learning_rate": 7.299629551520908e-05,
"loss": 1.796,
"step": 859
},
{
"epoch": 0.20767930451581743,
"grad_norm": 0.33011680841445923,
"learning_rate": 7.297878103068471e-05,
"loss": 1.9272,
"step": 860
},
{
"epoch": 0.2079207920792079,
"grad_norm": 0.31791943311691284,
"learning_rate": 7.296124678054963e-05,
"loss": 1.5777,
"step": 861
},
{
"epoch": 0.2081622796425984,
"grad_norm": 0.29987606406211853,
"learning_rate": 7.294369277531287e-05,
"loss": 1.7308,
"step": 862
},
{
"epoch": 0.20840376720598888,
"grad_norm": 0.3212811052799225,
"learning_rate": 7.292611902549534e-05,
"loss": 1.7248,
"step": 863
},
{
"epoch": 0.20864525476937937,
"grad_norm": 0.31155189871788025,
"learning_rate": 7.290852554162972e-05,
"loss": 1.6799,
"step": 864
},
{
"epoch": 0.20888674233276985,
"grad_norm": 0.30869436264038086,
"learning_rate": 7.289091233426054e-05,
"loss": 1.7143,
"step": 865
},
{
"epoch": 0.20912822989616034,
"grad_norm": 0.30815213918685913,
"learning_rate": 7.287327941394416e-05,
"loss": 1.6345,
"step": 866
},
{
"epoch": 0.20936971745955082,
"grad_norm": 0.32789891958236694,
"learning_rate": 7.285562679124878e-05,
"loss": 1.9442,
"step": 867
},
{
"epoch": 0.2096112050229413,
"grad_norm": 0.3645618259906769,
"learning_rate": 7.283795447675435e-05,
"loss": 1.9534,
"step": 868
},
{
"epoch": 0.2098526925863318,
"grad_norm": 0.311452180147171,
"learning_rate": 7.282026248105268e-05,
"loss": 1.9428,
"step": 869
},
{
"epoch": 0.21009418014972228,
"grad_norm": 0.3184005916118622,
"learning_rate": 7.280255081474731e-05,
"loss": 1.8433,
"step": 870
},
{
"epoch": 0.21033566771311277,
"grad_norm": 0.3258514404296875,
"learning_rate": 7.278481948845364e-05,
"loss": 1.9518,
"step": 871
},
{
"epoch": 0.21057715527650325,
"grad_norm": 0.29278308153152466,
"learning_rate": 7.276706851279883e-05,
"loss": 1.6152,
"step": 872
},
{
"epoch": 0.21081864283989374,
"grad_norm": 0.30696970224380493,
"learning_rate": 7.274929789842177e-05,
"loss": 1.7308,
"step": 873
},
{
"epoch": 0.21106013040328422,
"grad_norm": 0.3068031668663025,
"learning_rate": 7.273150765597319e-05,
"loss": 1.7358,
"step": 874
},
{
"epoch": 0.2113016179666747,
"grad_norm": 0.31447917222976685,
"learning_rate": 7.271369779611553e-05,
"loss": 1.7537,
"step": 875
},
{
"epoch": 0.2115431055300652,
"grad_norm": 0.31334996223449707,
"learning_rate": 7.269586832952303e-05,
"loss": 1.7341,
"step": 876
},
{
"epoch": 0.21178459309345568,
"grad_norm": 0.2954760193824768,
"learning_rate": 7.267801926688164e-05,
"loss": 1.6958,
"step": 877
},
{
"epoch": 0.21202608065684617,
"grad_norm": 0.30289843678474426,
"learning_rate": 7.26601506188891e-05,
"loss": 1.6662,
"step": 878
},
{
"epoch": 0.21226756822023665,
"grad_norm": 0.31064343452453613,
"learning_rate": 7.264226239625484e-05,
"loss": 1.6904,
"step": 879
},
{
"epoch": 0.21250905578362714,
"grad_norm": 0.32711198925971985,
"learning_rate": 7.262435460970006e-05,
"loss": 1.8815,
"step": 880
},
{
"epoch": 0.21275054334701762,
"grad_norm": 0.314898282289505,
"learning_rate": 7.260642726995768e-05,
"loss": 1.8043,
"step": 881
},
{
"epoch": 0.2129920309104081,
"grad_norm": 0.34022215008735657,
"learning_rate": 7.25884803877723e-05,
"loss": 1.8118,
"step": 882
},
{
"epoch": 0.2132335184737986,
"grad_norm": 0.29776495695114136,
"learning_rate": 7.25705139739003e-05,
"loss": 1.6633,
"step": 883
},
{
"epoch": 0.21347500603718908,
"grad_norm": 0.3354003429412842,
"learning_rate": 7.25525280391097e-05,
"loss": 1.8794,
"step": 884
},
{
"epoch": 0.21371649360057957,
"grad_norm": 0.3095185160636902,
"learning_rate": 7.253452259418027e-05,
"loss": 1.6259,
"step": 885
},
{
"epoch": 0.21395798116397005,
"grad_norm": 0.2914026379585266,
"learning_rate": 7.251649764990343e-05,
"loss": 1.6233,
"step": 886
},
{
"epoch": 0.21419946872736054,
"grad_norm": 0.3036092221736908,
"learning_rate": 7.249845321708234e-05,
"loss": 1.7473,
"step": 887
},
{
"epoch": 0.21444095629075102,
"grad_norm": 0.31751853227615356,
"learning_rate": 7.248038930653178e-05,
"loss": 1.7744,
"step": 888
},
{
"epoch": 0.2146824438541415,
"grad_norm": 0.31027981638908386,
"learning_rate": 7.246230592907824e-05,
"loss": 1.7248,
"step": 889
},
{
"epoch": 0.214923931417532,
"grad_norm": 0.3045722246170044,
"learning_rate": 7.244420309555989e-05,
"loss": 1.7947,
"step": 890
},
{
"epoch": 0.21516541898092248,
"grad_norm": 0.30841004848480225,
"learning_rate": 7.242608081682653e-05,
"loss": 1.6543,
"step": 891
},
{
"epoch": 0.21540690654431296,
"grad_norm": 0.31386974453926086,
"learning_rate": 7.24079391037396e-05,
"loss": 1.787,
"step": 892
},
{
"epoch": 0.21564839410770345,
"grad_norm": 0.3163403868675232,
"learning_rate": 7.238977796717225e-05,
"loss": 1.6371,
"step": 893
},
{
"epoch": 0.21588988167109394,
"grad_norm": 0.32152020931243896,
"learning_rate": 7.237159741800923e-05,
"loss": 1.7653,
"step": 894
},
{
"epoch": 0.21613136923448442,
"grad_norm": 0.31930601596832275,
"learning_rate": 7.235339746714693e-05,
"loss": 1.7829,
"step": 895
},
{
"epoch": 0.2163728567978749,
"grad_norm": 0.2922723889350891,
"learning_rate": 7.233517812549334e-05,
"loss": 1.6658,
"step": 896
},
{
"epoch": 0.2166143443612654,
"grad_norm": 0.3041679561138153,
"learning_rate": 7.231693940396811e-05,
"loss": 1.7421,
"step": 897
},
{
"epoch": 0.21685583192465588,
"grad_norm": 0.30257824063301086,
"learning_rate": 7.229868131350254e-05,
"loss": 1.7724,
"step": 898
},
{
"epoch": 0.21709731948804636,
"grad_norm": 0.31659653782844543,
"learning_rate": 7.228040386503943e-05,
"loss": 1.7479,
"step": 899
},
{
"epoch": 0.21733880705143685,
"grad_norm": 0.31071823835372925,
"learning_rate": 7.22621070695333e-05,
"loss": 1.8688,
"step": 900
},
{
"epoch": 0.21758029461482734,
"grad_norm": 0.2893757224082947,
"learning_rate": 7.224379093795016e-05,
"loss": 1.6186,
"step": 901
},
{
"epoch": 0.21782178217821782,
"grad_norm": 0.2960699498653412,
"learning_rate": 7.22254554812677e-05,
"loss": 1.7204,
"step": 902
},
{
"epoch": 0.2180632697416083,
"grad_norm": 0.30816584825515747,
"learning_rate": 7.220710071047515e-05,
"loss": 1.8978,
"step": 903
},
{
"epoch": 0.2183047573049988,
"grad_norm": 0.32497209310531616,
"learning_rate": 7.21887266365733e-05,
"loss": 2.0118,
"step": 904
},
{
"epoch": 0.21854624486838928,
"grad_norm": 0.29622262716293335,
"learning_rate": 7.217033327057453e-05,
"loss": 1.6086,
"step": 905
},
{
"epoch": 0.21878773243177976,
"grad_norm": 0.2999928295612335,
"learning_rate": 7.215192062350279e-05,
"loss": 1.7762,
"step": 906
},
{
"epoch": 0.21902921999517025,
"grad_norm": 0.3181908428668976,
"learning_rate": 7.213348870639357e-05,
"loss": 1.7705,
"step": 907
},
{
"epoch": 0.21927070755856073,
"grad_norm": 0.3228391110897064,
"learning_rate": 7.211503753029392e-05,
"loss": 1.7532,
"step": 908
},
{
"epoch": 0.21951219512195122,
"grad_norm": 0.30232638120651245,
"learning_rate": 7.209656710626243e-05,
"loss": 1.7742,
"step": 909
},
{
"epoch": 0.2197536826853417,
"grad_norm": 0.30942896008491516,
"learning_rate": 7.207807744536922e-05,
"loss": 1.6523,
"step": 910
},
{
"epoch": 0.2199951702487322,
"grad_norm": 0.31623750925064087,
"learning_rate": 7.205956855869593e-05,
"loss": 1.6995,
"step": 911
},
{
"epoch": 0.22023665781212268,
"grad_norm": 0.29854124784469604,
"learning_rate": 7.204104045733576e-05,
"loss": 1.6602,
"step": 912
},
{
"epoch": 0.22047814537551316,
"grad_norm": 0.32152605056762695,
"learning_rate": 7.202249315239342e-05,
"loss": 1.857,
"step": 913
},
{
"epoch": 0.22071963293890365,
"grad_norm": 0.3507840633392334,
"learning_rate": 7.200392665498505e-05,
"loss": 1.9128,
"step": 914
},
{
"epoch": 0.22096112050229413,
"grad_norm": 0.30421963334083557,
"learning_rate": 7.198534097623841e-05,
"loss": 1.867,
"step": 915
},
{
"epoch": 0.22120260806568462,
"grad_norm": 0.31841713190078735,
"learning_rate": 7.196673612729268e-05,
"loss": 1.8014,
"step": 916
},
{
"epoch": 0.2214440956290751,
"grad_norm": 0.3139771521091461,
"learning_rate": 7.194811211929856e-05,
"loss": 1.8255,
"step": 917
},
{
"epoch": 0.2216855831924656,
"grad_norm": 0.31301793456077576,
"learning_rate": 7.19294689634182e-05,
"loss": 1.7579,
"step": 918
},
{
"epoch": 0.22192707075585608,
"grad_norm": 0.2959013283252716,
"learning_rate": 7.191080667082529e-05,
"loss": 1.5714,
"step": 919
},
{
"epoch": 0.22216855831924656,
"grad_norm": 0.3038369417190552,
"learning_rate": 7.189212525270492e-05,
"loss": 1.6526,
"step": 920
},
{
"epoch": 0.22241004588263705,
"grad_norm": 0.31139951944351196,
"learning_rate": 7.187342472025368e-05,
"loss": 1.8009,
"step": 921
},
{
"epoch": 0.22265153344602753,
"grad_norm": 0.3210260570049286,
"learning_rate": 7.185470508467963e-05,
"loss": 1.7528,
"step": 922
},
{
"epoch": 0.22289302100941802,
"grad_norm": 0.3127114474773407,
"learning_rate": 7.183596635720222e-05,
"loss": 1.8188,
"step": 923
},
{
"epoch": 0.2231345085728085,
"grad_norm": 0.29326117038726807,
"learning_rate": 7.18172085490524e-05,
"loss": 1.6762,
"step": 924
},
{
"epoch": 0.223375996136199,
"grad_norm": 0.30718374252319336,
"learning_rate": 7.179843167147253e-05,
"loss": 1.7206,
"step": 925
},
{
"epoch": 0.22361748369958948,
"grad_norm": 0.3027680218219757,
"learning_rate": 7.177963573571641e-05,
"loss": 1.6067,
"step": 926
},
{
"epoch": 0.22385897126297996,
"grad_norm": 0.30007830262184143,
"learning_rate": 7.176082075304924e-05,
"loss": 1.591,
"step": 927
},
{
"epoch": 0.22410045882637045,
"grad_norm": 0.31990012526512146,
"learning_rate": 7.17419867347477e-05,
"loss": 1.9032,
"step": 928
},
{
"epoch": 0.22434194638976093,
"grad_norm": 0.3229444622993469,
"learning_rate": 7.17231336920998e-05,
"loss": 1.6801,
"step": 929
},
{
"epoch": 0.22458343395315142,
"grad_norm": 0.3086046874523163,
"learning_rate": 7.170426163640497e-05,
"loss": 1.827,
"step": 930
},
{
"epoch": 0.2248249215165419,
"grad_norm": 0.32034003734588623,
"learning_rate": 7.168537057897407e-05,
"loss": 1.7706,
"step": 931
},
{
"epoch": 0.2250664090799324,
"grad_norm": 0.3041267991065979,
"learning_rate": 7.166646053112933e-05,
"loss": 1.771,
"step": 932
},
{
"epoch": 0.22530789664332287,
"grad_norm": 0.3302775025367737,
"learning_rate": 7.164753150420436e-05,
"loss": 1.7872,
"step": 933
},
{
"epoch": 0.22554938420671336,
"grad_norm": 0.3071431815624237,
"learning_rate": 7.162858350954412e-05,
"loss": 1.7244,
"step": 934
},
{
"epoch": 0.22579087177010385,
"grad_norm": 0.30182769894599915,
"learning_rate": 7.160961655850501e-05,
"loss": 1.6328,
"step": 935
},
{
"epoch": 0.22603235933349433,
"grad_norm": 0.30967414379119873,
"learning_rate": 7.159063066245471e-05,
"loss": 1.8115,
"step": 936
},
{
"epoch": 0.22627384689688482,
"grad_norm": 0.3253704905509949,
"learning_rate": 7.157162583277229e-05,
"loss": 1.7741,
"step": 937
},
{
"epoch": 0.2265153344602753,
"grad_norm": 0.30837851762771606,
"learning_rate": 7.155260208084817e-05,
"loss": 1.7762,
"step": 938
},
{
"epoch": 0.2267568220236658,
"grad_norm": 0.35172855854034424,
"learning_rate": 7.153355941808413e-05,
"loss": 2.0043,
"step": 939
},
{
"epoch": 0.22699830958705627,
"grad_norm": 0.35760238766670227,
"learning_rate": 7.151449785589324e-05,
"loss": 1.7604,
"step": 940
},
{
"epoch": 0.22723979715044676,
"grad_norm": 0.340904176235199,
"learning_rate": 7.149541740569991e-05,
"loss": 1.8142,
"step": 941
},
{
"epoch": 0.22748128471383725,
"grad_norm": 0.3355856239795685,
"learning_rate": 7.147631807893989e-05,
"loss": 1.8198,
"step": 942
},
{
"epoch": 0.22772277227722773,
"grad_norm": 0.3542833626270294,
"learning_rate": 7.145719988706024e-05,
"loss": 1.7095,
"step": 943
},
{
"epoch": 0.22796425984061822,
"grad_norm": 0.31368035078048706,
"learning_rate": 7.143806284151933e-05,
"loss": 1.7384,
"step": 944
},
{
"epoch": 0.2282057474040087,
"grad_norm": 0.3218083679676056,
"learning_rate": 7.141890695378678e-05,
"loss": 1.6452,
"step": 945
},
{
"epoch": 0.2284472349673992,
"grad_norm": 0.3157740533351898,
"learning_rate": 7.139973223534359e-05,
"loss": 1.7696,
"step": 946
},
{
"epoch": 0.22868872253078967,
"grad_norm": 0.32926589250564575,
"learning_rate": 7.138053869768196e-05,
"loss": 1.7798,
"step": 947
},
{
"epoch": 0.22893021009418016,
"grad_norm": 0.3095945417881012,
"learning_rate": 7.136132635230542e-05,
"loss": 1.8042,
"step": 948
},
{
"epoch": 0.22917169765757064,
"grad_norm": 0.30121171474456787,
"learning_rate": 7.134209521072878e-05,
"loss": 1.6287,
"step": 949
},
{
"epoch": 0.22941318522096113,
"grad_norm": 0.3294576406478882,
"learning_rate": 7.132284528447808e-05,
"loss": 1.8929,
"step": 950
},
{
"epoch": 0.22965467278435162,
"grad_norm": 0.3472389876842499,
"learning_rate": 7.130357658509062e-05,
"loss": 1.824,
"step": 951
},
{
"epoch": 0.2298961603477421,
"grad_norm": 0.32449764013290405,
"learning_rate": 7.128428912411498e-05,
"loss": 1.6925,
"step": 952
},
{
"epoch": 0.2301376479111326,
"grad_norm": 0.3121519386768341,
"learning_rate": 7.126498291311098e-05,
"loss": 1.7803,
"step": 953
},
{
"epoch": 0.23037913547452307,
"grad_norm": 0.3130584955215454,
"learning_rate": 7.124565796364964e-05,
"loss": 1.815,
"step": 954
},
{
"epoch": 0.23062062303791356,
"grad_norm": 0.3336242139339447,
"learning_rate": 7.122631428731327e-05,
"loss": 1.8314,
"step": 955
},
{
"epoch": 0.23086211060130404,
"grad_norm": 0.32837650179862976,
"learning_rate": 7.120695189569536e-05,
"loss": 1.8304,
"step": 956
},
{
"epoch": 0.23110359816469453,
"grad_norm": 0.3215225338935852,
"learning_rate": 7.11875708004006e-05,
"loss": 1.796,
"step": 957
},
{
"epoch": 0.23134508572808501,
"grad_norm": 0.3286936283111572,
"learning_rate": 7.116817101304497e-05,
"loss": 1.8722,
"step": 958
},
{
"epoch": 0.2315865732914755,
"grad_norm": 0.30622512102127075,
"learning_rate": 7.114875254525557e-05,
"loss": 1.7254,
"step": 959
},
{
"epoch": 0.23182806085486599,
"grad_norm": 0.3257673382759094,
"learning_rate": 7.112931540867074e-05,
"loss": 1.7707,
"step": 960
},
{
"epoch": 0.23206954841825647,
"grad_norm": 0.3099058270454407,
"learning_rate": 7.110985961494e-05,
"loss": 1.7187,
"step": 961
},
{
"epoch": 0.23231103598164696,
"grad_norm": 0.2989310324192047,
"learning_rate": 7.109038517572401e-05,
"loss": 1.7216,
"step": 962
},
{
"epoch": 0.23255252354503744,
"grad_norm": 0.2901287376880646,
"learning_rate": 7.107089210269472e-05,
"loss": 1.5476,
"step": 963
},
{
"epoch": 0.23279401110842793,
"grad_norm": 0.31841766834259033,
"learning_rate": 7.10513804075351e-05,
"loss": 1.7271,
"step": 964
},
{
"epoch": 0.23303549867181841,
"grad_norm": 0.3112894892692566,
"learning_rate": 7.103185010193938e-05,
"loss": 1.8632,
"step": 965
},
{
"epoch": 0.2332769862352089,
"grad_norm": 0.29903125762939453,
"learning_rate": 7.101230119761294e-05,
"loss": 1.5865,
"step": 966
},
{
"epoch": 0.23351847379859939,
"grad_norm": 0.34164854884147644,
"learning_rate": 7.099273370627225e-05,
"loss": 1.8468,
"step": 967
},
{
"epoch": 0.23375996136198987,
"grad_norm": 0.3156038522720337,
"learning_rate": 7.097314763964496e-05,
"loss": 1.8972,
"step": 968
},
{
"epoch": 0.23400144892538036,
"grad_norm": 0.3213566541671753,
"learning_rate": 7.095354300946988e-05,
"loss": 1.7789,
"step": 969
},
{
"epoch": 0.23424293648877084,
"grad_norm": 0.31230372190475464,
"learning_rate": 7.093391982749686e-05,
"loss": 1.8018,
"step": 970
},
{
"epoch": 0.2344844240521613,
"grad_norm": 0.2937510907649994,
"learning_rate": 7.091427810548698e-05,
"loss": 1.6656,
"step": 971
},
{
"epoch": 0.23472591161555179,
"grad_norm": 0.3283037841320038,
"learning_rate": 7.089461785521232e-05,
"loss": 1.9303,
"step": 972
},
{
"epoch": 0.23496739917894227,
"grad_norm": 0.29008540511131287,
"learning_rate": 7.087493908845617e-05,
"loss": 1.6371,
"step": 973
},
{
"epoch": 0.23520888674233276,
"grad_norm": 0.29574844241142273,
"learning_rate": 7.085524181701281e-05,
"loss": 1.6921,
"step": 974
},
{
"epoch": 0.23545037430572324,
"grad_norm": 0.30947405099868774,
"learning_rate": 7.083552605268772e-05,
"loss": 1.7036,
"step": 975
},
{
"epoch": 0.23569186186911373,
"grad_norm": 0.30380678176879883,
"learning_rate": 7.081579180729739e-05,
"loss": 1.7498,
"step": 976
},
{
"epoch": 0.2359333494325042,
"grad_norm": 0.30693385004997253,
"learning_rate": 7.079603909266939e-05,
"loss": 1.6627,
"step": 977
},
{
"epoch": 0.2361748369958947,
"grad_norm": 0.31431153416633606,
"learning_rate": 7.07762679206424e-05,
"loss": 1.7361,
"step": 978
},
{
"epoch": 0.23641632455928518,
"grad_norm": 0.3664765954017639,
"learning_rate": 7.075647830306614e-05,
"loss": 2.0544,
"step": 979
},
{
"epoch": 0.23665781212267567,
"grad_norm": 0.29501873254776,
"learning_rate": 7.073667025180136e-05,
"loss": 1.6702,
"step": 980
},
{
"epoch": 0.23689929968606616,
"grad_norm": 0.3374174237251282,
"learning_rate": 7.07168437787199e-05,
"loss": 1.7518,
"step": 981
},
{
"epoch": 0.23714078724945664,
"grad_norm": 0.30214452743530273,
"learning_rate": 7.069699889570464e-05,
"loss": 1.7077,
"step": 982
},
{
"epoch": 0.23738227481284713,
"grad_norm": 0.32760128378868103,
"learning_rate": 7.067713561464943e-05,
"loss": 1.7956,
"step": 983
},
{
"epoch": 0.2376237623762376,
"grad_norm": 0.32891011238098145,
"learning_rate": 7.065725394745925e-05,
"loss": 1.7961,
"step": 984
},
{
"epoch": 0.2378652499396281,
"grad_norm": 0.316244900226593,
"learning_rate": 7.063735390605001e-05,
"loss": 1.7438,
"step": 985
},
{
"epoch": 0.23810673750301858,
"grad_norm": 0.3115881085395813,
"learning_rate": 7.061743550234867e-05,
"loss": 1.7114,
"step": 986
},
{
"epoch": 0.23834822506640907,
"grad_norm": 0.30240875482559204,
"learning_rate": 7.05974987482932e-05,
"loss": 1.9043,
"step": 987
},
{
"epoch": 0.23858971262979956,
"grad_norm": 0.29856961965560913,
"learning_rate": 7.057754365583252e-05,
"loss": 1.6706,
"step": 988
},
{
"epoch": 0.23883120019319004,
"grad_norm": 0.3027576804161072,
"learning_rate": 7.055757023692664e-05,
"loss": 1.6477,
"step": 989
},
{
"epoch": 0.23907268775658053,
"grad_norm": 0.29951512813568115,
"learning_rate": 7.053757850354646e-05,
"loss": 1.7039,
"step": 990
},
{
"epoch": 0.239314175319971,
"grad_norm": 0.3174339532852173,
"learning_rate": 7.051756846767392e-05,
"loss": 1.7394,
"step": 991
},
{
"epoch": 0.2395556628833615,
"grad_norm": 0.36309531331062317,
"learning_rate": 7.049754014130186e-05,
"loss": 1.7498,
"step": 992
},
{
"epoch": 0.23979715044675198,
"grad_norm": 0.32905271649360657,
"learning_rate": 7.047749353643416e-05,
"loss": 1.93,
"step": 993
},
{
"epoch": 0.24003863801014247,
"grad_norm": 0.33373570442199707,
"learning_rate": 7.045742866508557e-05,
"loss": 1.9002,
"step": 994
},
{
"epoch": 0.24028012557353295,
"grad_norm": 0.29678893089294434,
"learning_rate": 7.043734553928188e-05,
"loss": 1.7453,
"step": 995
},
{
"epoch": 0.24052161313692344,
"grad_norm": 0.3152943253517151,
"learning_rate": 7.041724417105977e-05,
"loss": 1.9029,
"step": 996
},
{
"epoch": 0.24076310070031393,
"grad_norm": 0.3238842785358429,
"learning_rate": 7.039712457246685e-05,
"loss": 1.8002,
"step": 997
},
{
"epoch": 0.2410045882637044,
"grad_norm": 0.3086811304092407,
"learning_rate": 7.037698675556167e-05,
"loss": 1.7737,
"step": 998
},
{
"epoch": 0.2412460758270949,
"grad_norm": 0.3081672191619873,
"learning_rate": 7.03568307324137e-05,
"loss": 1.7324,
"step": 999
},
{
"epoch": 0.24148756339048538,
"grad_norm": 0.31431376934051514,
"learning_rate": 7.03366565151033e-05,
"loss": 1.9154,
"step": 1000
},
{
"epoch": 0.24172905095387587,
"grad_norm": 0.3176340162754059,
"learning_rate": 7.031646411572175e-05,
"loss": 1.8225,
"step": 1001
},
{
"epoch": 0.24197053851726635,
"grad_norm": 0.30996355414390564,
"learning_rate": 7.029625354637126e-05,
"loss": 1.7401,
"step": 1002
},
{
"epoch": 0.24221202608065684,
"grad_norm": 0.30828937888145447,
"learning_rate": 7.027602481916487e-05,
"loss": 1.7273,
"step": 1003
},
{
"epoch": 0.24245351364404732,
"grad_norm": 0.31076958775520325,
"learning_rate": 7.025577794622655e-05,
"loss": 1.7303,
"step": 1004
},
{
"epoch": 0.2426950012074378,
"grad_norm": 0.29473546147346497,
"learning_rate": 7.023551293969111e-05,
"loss": 1.6771,
"step": 1005
},
{
"epoch": 0.2429364887708283,
"grad_norm": 0.3021571636199951,
"learning_rate": 7.021522981170426e-05,
"loss": 1.6781,
"step": 1006
},
{
"epoch": 0.24317797633421878,
"grad_norm": 0.3115958273410797,
"learning_rate": 7.019492857442254e-05,
"loss": 1.7734,
"step": 1007
},
{
"epoch": 0.24341946389760927,
"grad_norm": 0.31508898735046387,
"learning_rate": 7.017460924001337e-05,
"loss": 1.8933,
"step": 1008
},
{
"epoch": 0.24366095146099975,
"grad_norm": 0.30906936526298523,
"learning_rate": 7.015427182065502e-05,
"loss": 1.7643,
"step": 1009
},
{
"epoch": 0.24390243902439024,
"grad_norm": 0.3254733681678772,
"learning_rate": 7.013391632853658e-05,
"loss": 1.877,
"step": 1010
},
{
"epoch": 0.24414392658778072,
"grad_norm": 0.30368247628211975,
"learning_rate": 7.011354277585796e-05,
"loss": 1.7064,
"step": 1011
},
{
"epoch": 0.2443854141511712,
"grad_norm": 0.312253475189209,
"learning_rate": 7.009315117482992e-05,
"loss": 1.7001,
"step": 1012
},
{
"epoch": 0.2446269017145617,
"grad_norm": 0.3097797930240631,
"learning_rate": 7.007274153767401e-05,
"loss": 1.7155,
"step": 1013
},
{
"epoch": 0.24486838927795218,
"grad_norm": 0.32299181818962097,
"learning_rate": 7.005231387662266e-05,
"loss": 1.75,
"step": 1014
},
{
"epoch": 0.24510987684134267,
"grad_norm": 0.3350425064563751,
"learning_rate": 7.003186820391902e-05,
"loss": 1.9598,
"step": 1015
},
{
"epoch": 0.24535136440473315,
"grad_norm": 0.31722837686538696,
"learning_rate": 7.001140453181705e-05,
"loss": 1.7972,
"step": 1016
},
{
"epoch": 0.24559285196812364,
"grad_norm": 0.33487066626548767,
"learning_rate": 6.999092287258155e-05,
"loss": 1.7209,
"step": 1017
},
{
"epoch": 0.24583433953151412,
"grad_norm": 0.299215167760849,
"learning_rate": 6.997042323848803e-05,
"loss": 1.7777,
"step": 1018
},
{
"epoch": 0.2460758270949046,
"grad_norm": 0.31593263149261475,
"learning_rate": 6.994990564182284e-05,
"loss": 1.9017,
"step": 1019
},
{
"epoch": 0.2463173146582951,
"grad_norm": 0.33589228987693787,
"learning_rate": 6.992937009488303e-05,
"loss": 1.7828,
"step": 1020
},
{
"epoch": 0.24655880222168558,
"grad_norm": 0.31036049127578735,
"learning_rate": 6.990881660997647e-05,
"loss": 1.6732,
"step": 1021
},
{
"epoch": 0.24680028978507607,
"grad_norm": 0.31886163353919983,
"learning_rate": 6.988824519942174e-05,
"loss": 1.745,
"step": 1022
},
{
"epoch": 0.24704177734846655,
"grad_norm": 0.3151240944862366,
"learning_rate": 6.986765587554818e-05,
"loss": 1.6845,
"step": 1023
},
{
"epoch": 0.24728326491185704,
"grad_norm": 0.33134594559669495,
"learning_rate": 6.984704865069587e-05,
"loss": 1.8795,
"step": 1024
},
{
"epoch": 0.24752475247524752,
"grad_norm": 0.3216524124145508,
"learning_rate": 6.98264235372156e-05,
"loss": 1.8034,
"step": 1025
},
{
"epoch": 0.247766240038638,
"grad_norm": 0.3143945336341858,
"learning_rate": 6.98057805474689e-05,
"loss": 1.7907,
"step": 1026
},
{
"epoch": 0.2480077276020285,
"grad_norm": 0.2938880920410156,
"learning_rate": 6.978511969382799e-05,
"loss": 1.6928,
"step": 1027
},
{
"epoch": 0.24824921516541898,
"grad_norm": 0.3236706852912903,
"learning_rate": 6.976444098867584e-05,
"loss": 1.8266,
"step": 1028
},
{
"epoch": 0.24849070272880946,
"grad_norm": 0.3211507499217987,
"learning_rate": 6.974374444440608e-05,
"loss": 1.6146,
"step": 1029
},
{
"epoch": 0.24873219029219995,
"grad_norm": 0.32520592212677,
"learning_rate": 6.972303007342304e-05,
"loss": 1.8695,
"step": 1030
},
{
"epoch": 0.24897367785559044,
"grad_norm": 0.3441825211048126,
"learning_rate": 6.970229788814176e-05,
"loss": 1.8257,
"step": 1031
},
{
"epoch": 0.24921516541898092,
"grad_norm": 0.31615981459617615,
"learning_rate": 6.968154790098791e-05,
"loss": 1.742,
"step": 1032
},
{
"epoch": 0.2494566529823714,
"grad_norm": 0.32662391662597656,
"learning_rate": 6.966078012439787e-05,
"loss": 1.7395,
"step": 1033
},
{
"epoch": 0.2496981405457619,
"grad_norm": 0.3284902572631836,
"learning_rate": 6.963999457081865e-05,
"loss": 1.9117,
"step": 1034
},
{
"epoch": 0.24993962810915238,
"grad_norm": 0.3873670697212219,
"learning_rate": 6.961919125270795e-05,
"loss": 1.9818,
"step": 1035
},
{
"epoch": 0.25018111567254286,
"grad_norm": 0.30828049778938293,
"learning_rate": 6.95983701825341e-05,
"loss": 1.7419,
"step": 1036
},
{
"epoch": 0.25042260323593335,
"grad_norm": 0.33050593733787537,
"learning_rate": 6.957753137277606e-05,
"loss": 1.8804,
"step": 1037
},
{
"epoch": 0.25066409079932384,
"grad_norm": 0.32716748118400574,
"learning_rate": 6.955667483592344e-05,
"loss": 1.7466,
"step": 1038
},
{
"epoch": 0.2509055783627143,
"grad_norm": 0.31101343035697937,
"learning_rate": 6.953580058447644e-05,
"loss": 1.6372,
"step": 1039
},
{
"epoch": 0.2511470659261048,
"grad_norm": 0.3111160397529602,
"learning_rate": 6.951490863094593e-05,
"loss": 1.7179,
"step": 1040
},
{
"epoch": 0.2513885534894953,
"grad_norm": 0.29902443289756775,
"learning_rate": 6.949399898785336e-05,
"loss": 1.6466,
"step": 1041
},
{
"epoch": 0.2516300410528858,
"grad_norm": 0.3357815146446228,
"learning_rate": 6.947307166773077e-05,
"loss": 1.8709,
"step": 1042
},
{
"epoch": 0.25187152861627626,
"grad_norm": 0.31973618268966675,
"learning_rate": 6.945212668312082e-05,
"loss": 1.6442,
"step": 1043
},
{
"epoch": 0.25211301617966675,
"grad_norm": 0.31211501359939575,
"learning_rate": 6.943116404657673e-05,
"loss": 1.6384,
"step": 1044
},
{
"epoch": 0.25235450374305723,
"grad_norm": 0.31134694814682007,
"learning_rate": 6.941018377066233e-05,
"loss": 1.7111,
"step": 1045
},
{
"epoch": 0.2525959913064477,
"grad_norm": 0.33072417974472046,
"learning_rate": 6.9389185867952e-05,
"loss": 1.795,
"step": 1046
},
{
"epoch": 0.2528374788698382,
"grad_norm": 0.31047114729881287,
"learning_rate": 6.93681703510307e-05,
"loss": 1.7906,
"step": 1047
},
{
"epoch": 0.2530789664332287,
"grad_norm": 0.3172812759876251,
"learning_rate": 6.934713723249394e-05,
"loss": 1.7707,
"step": 1048
},
{
"epoch": 0.2533204539966192,
"grad_norm": 0.3336034417152405,
"learning_rate": 6.932608652494775e-05,
"loss": 1.8711,
"step": 1049
},
{
"epoch": 0.25356194156000966,
"grad_norm": 0.30605006217956543,
"learning_rate": 6.930501824100876e-05,
"loss": 1.7119,
"step": 1050
},
{
"epoch": 0.25380342912340015,
"grad_norm": 0.31676533818244934,
"learning_rate": 6.92839323933041e-05,
"loss": 1.7777,
"step": 1051
},
{
"epoch": 0.25404491668679063,
"grad_norm": 0.3026861548423767,
"learning_rate": 6.926282899447145e-05,
"loss": 1.5476,
"step": 1052
},
{
"epoch": 0.2542864042501811,
"grad_norm": 0.3109389841556549,
"learning_rate": 6.924170805715894e-05,
"loss": 1.6907,
"step": 1053
},
{
"epoch": 0.2545278918135716,
"grad_norm": 0.31733840703964233,
"learning_rate": 6.922056959402528e-05,
"loss": 1.8424,
"step": 1054
},
{
"epoch": 0.2547693793769621,
"grad_norm": 0.3075104355812073,
"learning_rate": 6.919941361773971e-05,
"loss": 1.7506,
"step": 1055
},
{
"epoch": 0.2550108669403526,
"grad_norm": 0.30655089020729065,
"learning_rate": 6.917824014098187e-05,
"loss": 1.7237,
"step": 1056
},
{
"epoch": 0.25525235450374306,
"grad_norm": 0.3038625419139862,
"learning_rate": 6.915704917644196e-05,
"loss": 1.7619,
"step": 1057
},
{
"epoch": 0.25549384206713355,
"grad_norm": 0.3069184720516205,
"learning_rate": 6.913584073682062e-05,
"loss": 1.7937,
"step": 1058
},
{
"epoch": 0.25573532963052403,
"grad_norm": 0.3129485845565796,
"learning_rate": 6.911461483482903e-05,
"loss": 1.7742,
"step": 1059
},
{
"epoch": 0.2559768171939145,
"grad_norm": 0.3241616487503052,
"learning_rate": 6.909337148318877e-05,
"loss": 1.8027,
"step": 1060
},
{
"epoch": 0.256218304757305,
"grad_norm": 0.31016314029693604,
"learning_rate": 6.907211069463189e-05,
"loss": 1.6762,
"step": 1061
},
{
"epoch": 0.2564597923206955,
"grad_norm": 0.31560423970222473,
"learning_rate": 6.90508324819009e-05,
"loss": 1.7494,
"step": 1062
},
{
"epoch": 0.256701279884086,
"grad_norm": 0.3282552659511566,
"learning_rate": 6.902953685774877e-05,
"loss": 1.7234,
"step": 1063
},
{
"epoch": 0.25694276744747646,
"grad_norm": 0.2940976321697235,
"learning_rate": 6.900822383493888e-05,
"loss": 1.6625,
"step": 1064
},
{
"epoch": 0.25718425501086695,
"grad_norm": 0.3287374973297119,
"learning_rate": 6.898689342624505e-05,
"loss": 1.7004,
"step": 1065
},
{
"epoch": 0.25742574257425743,
"grad_norm": 0.2989017367362976,
"learning_rate": 6.896554564445151e-05,
"loss": 1.7968,
"step": 1066
},
{
"epoch": 0.2576672301376479,
"grad_norm": 0.3247399628162384,
"learning_rate": 6.894418050235291e-05,
"loss": 1.8534,
"step": 1067
},
{
"epoch": 0.2579087177010384,
"grad_norm": 0.31024983525276184,
"learning_rate": 6.892279801275434e-05,
"loss": 1.7237,
"step": 1068
},
{
"epoch": 0.2581502052644289,
"grad_norm": 0.30201882123947144,
"learning_rate": 6.890139818847119e-05,
"loss": 1.6479,
"step": 1069
},
{
"epoch": 0.2583916928278194,
"grad_norm": 0.30573418736457825,
"learning_rate": 6.887998104232934e-05,
"loss": 1.7856,
"step": 1070
},
{
"epoch": 0.25863318039120986,
"grad_norm": 0.32034432888031006,
"learning_rate": 6.885854658716501e-05,
"loss": 1.8321,
"step": 1071
},
{
"epoch": 0.25887466795460035,
"grad_norm": 0.30810311436653137,
"learning_rate": 6.883709483582479e-05,
"loss": 1.761,
"step": 1072
},
{
"epoch": 0.25911615551799083,
"grad_norm": 0.3177671730518341,
"learning_rate": 6.881562580116563e-05,
"loss": 1.9071,
"step": 1073
},
{
"epoch": 0.2593576430813813,
"grad_norm": 0.3227800130844116,
"learning_rate": 6.879413949605488e-05,
"loss": 1.8626,
"step": 1074
},
{
"epoch": 0.2595991306447718,
"grad_norm": 0.31385767459869385,
"learning_rate": 6.877263593337018e-05,
"loss": 1.7978,
"step": 1075
},
{
"epoch": 0.2598406182081623,
"grad_norm": 0.3195452094078064,
"learning_rate": 6.875111512599959e-05,
"loss": 1.7311,
"step": 1076
},
{
"epoch": 0.2600821057715528,
"grad_norm": 0.30203190445899963,
"learning_rate": 6.87295770868414e-05,
"loss": 1.8604,
"step": 1077
},
{
"epoch": 0.26032359333494326,
"grad_norm": 0.31031322479248047,
"learning_rate": 6.870802182880436e-05,
"loss": 1.7341,
"step": 1078
},
{
"epoch": 0.26056508089833375,
"grad_norm": 0.3019157946109772,
"learning_rate": 6.868644936480741e-05,
"loss": 1.7871,
"step": 1079
},
{
"epoch": 0.26080656846172423,
"grad_norm": 0.30810457468032837,
"learning_rate": 6.866485970777988e-05,
"loss": 1.6875,
"step": 1080
},
{
"epoch": 0.2610480560251147,
"grad_norm": 0.2972477078437805,
"learning_rate": 6.864325287066141e-05,
"loss": 1.7081,
"step": 1081
},
{
"epoch": 0.2612895435885052,
"grad_norm": 0.30459290742874146,
"learning_rate": 6.862162886640187e-05,
"loss": 1.7533,
"step": 1082
},
{
"epoch": 0.2615310311518957,
"grad_norm": 0.31380796432495117,
"learning_rate": 6.85999877079615e-05,
"loss": 1.7714,
"step": 1083
},
{
"epoch": 0.2617725187152862,
"grad_norm": 0.33258458971977234,
"learning_rate": 6.857832940831076e-05,
"loss": 1.7071,
"step": 1084
},
{
"epoch": 0.26201400627867666,
"grad_norm": 0.3180256485939026,
"learning_rate": 6.855665398043041e-05,
"loss": 1.7715,
"step": 1085
},
{
"epoch": 0.26225549384206714,
"grad_norm": 0.30172500014305115,
"learning_rate": 6.853496143731148e-05,
"loss": 1.66,
"step": 1086
},
{
"epoch": 0.26249698140545763,
"grad_norm": 0.3087107837200165,
"learning_rate": 6.851325179195525e-05,
"loss": 1.8475,
"step": 1087
},
{
"epoch": 0.2627384689688481,
"grad_norm": 0.30850815773010254,
"learning_rate": 6.849152505737324e-05,
"loss": 1.6628,
"step": 1088
},
{
"epoch": 0.2629799565322386,
"grad_norm": 0.32561859488487244,
"learning_rate": 6.846978124658721e-05,
"loss": 1.8223,
"step": 1089
},
{
"epoch": 0.2632214440956291,
"grad_norm": 0.31419193744659424,
"learning_rate": 6.84480203726292e-05,
"loss": 1.7969,
"step": 1090
},
{
"epoch": 0.2634629316590196,
"grad_norm": 0.2968290448188782,
"learning_rate": 6.842624244854143e-05,
"loss": 1.6429,
"step": 1091
},
{
"epoch": 0.26370441922241006,
"grad_norm": 0.29795342683792114,
"learning_rate": 6.840444748737634e-05,
"loss": 1.6196,
"step": 1092
},
{
"epoch": 0.26394590678580054,
"grad_norm": 0.3083789646625519,
"learning_rate": 6.838263550219661e-05,
"loss": 1.6487,
"step": 1093
},
{
"epoch": 0.26418739434919103,
"grad_norm": 0.3154310882091522,
"learning_rate": 6.83608065060751e-05,
"loss": 1.7913,
"step": 1094
},
{
"epoch": 0.2644288819125815,
"grad_norm": 0.31129124760627747,
"learning_rate": 6.833896051209488e-05,
"loss": 1.8084,
"step": 1095
},
{
"epoch": 0.264670369475972,
"grad_norm": 0.3194768726825714,
"learning_rate": 6.831709753334917e-05,
"loss": 1.7722,
"step": 1096
},
{
"epoch": 0.2649118570393625,
"grad_norm": 0.30591970682144165,
"learning_rate": 6.829521758294145e-05,
"loss": 1.6323,
"step": 1097
},
{
"epoch": 0.26515334460275297,
"grad_norm": 0.34531369805336,
"learning_rate": 6.827332067398527e-05,
"loss": 1.9616,
"step": 1098
},
{
"epoch": 0.26539483216614346,
"grad_norm": 0.3163597881793976,
"learning_rate": 6.825140681960442e-05,
"loss": 1.8197,
"step": 1099
},
{
"epoch": 0.26563631972953394,
"grad_norm": 0.2946029603481293,
"learning_rate": 6.822947603293281e-05,
"loss": 1.5898,
"step": 1100
},
{
"epoch": 0.26587780729292443,
"grad_norm": 0.30496641993522644,
"learning_rate": 6.820752832711453e-05,
"loss": 1.5909,
"step": 1101
},
{
"epoch": 0.2661192948563149,
"grad_norm": 0.33542415499687195,
"learning_rate": 6.818556371530378e-05,
"loss": 1.781,
"step": 1102
},
{
"epoch": 0.2663607824197054,
"grad_norm": 0.29307273030281067,
"learning_rate": 6.81635822106649e-05,
"loss": 1.6314,
"step": 1103
},
{
"epoch": 0.2666022699830959,
"grad_norm": 0.3387092053890228,
"learning_rate": 6.814158382637235e-05,
"loss": 1.8468,
"step": 1104
},
{
"epoch": 0.26684375754648637,
"grad_norm": 0.31816431879997253,
"learning_rate": 6.811956857561074e-05,
"loss": 1.6737,
"step": 1105
},
{
"epoch": 0.26708524510987686,
"grad_norm": 0.316036581993103,
"learning_rate": 6.809753647157472e-05,
"loss": 1.7484,
"step": 1106
},
{
"epoch": 0.26732673267326734,
"grad_norm": 0.33369365334510803,
"learning_rate": 6.807548752746911e-05,
"loss": 1.8675,
"step": 1107
},
{
"epoch": 0.2675682202366578,
"grad_norm": 0.31673648953437805,
"learning_rate": 6.805342175650881e-05,
"loss": 1.7556,
"step": 1108
},
{
"epoch": 0.2678097078000483,
"grad_norm": 0.3155718147754669,
"learning_rate": 6.803133917191878e-05,
"loss": 1.6849,
"step": 1109
},
{
"epoch": 0.2680511953634388,
"grad_norm": 0.31575703620910645,
"learning_rate": 6.800923978693403e-05,
"loss": 1.7253,
"step": 1110
},
{
"epoch": 0.2682926829268293,
"grad_norm": 0.2937328815460205,
"learning_rate": 6.798712361479974e-05,
"loss": 1.6773,
"step": 1111
},
{
"epoch": 0.26853417049021977,
"grad_norm": 0.30649858713150024,
"learning_rate": 6.796499066877106e-05,
"loss": 1.7406,
"step": 1112
},
{
"epoch": 0.26877565805361026,
"grad_norm": 0.34489336609840393,
"learning_rate": 6.79428409621132e-05,
"loss": 1.6196,
"step": 1113
},
{
"epoch": 0.26901714561700074,
"grad_norm": 0.29834282398223877,
"learning_rate": 6.792067450810149e-05,
"loss": 1.6866,
"step": 1114
},
{
"epoch": 0.2692586331803912,
"grad_norm": 0.3076286315917969,
"learning_rate": 6.78984913200212e-05,
"loss": 1.8003,
"step": 1115
},
{
"epoch": 0.2695001207437817,
"grad_norm": 0.3159911632537842,
"learning_rate": 6.78762914111677e-05,
"loss": 1.766,
"step": 1116
},
{
"epoch": 0.2697416083071722,
"grad_norm": 0.3220473527908325,
"learning_rate": 6.785407479484633e-05,
"loss": 1.8153,
"step": 1117
},
{
"epoch": 0.2699830958705627,
"grad_norm": 0.31992602348327637,
"learning_rate": 6.78318414843725e-05,
"loss": 1.9029,
"step": 1118
},
{
"epoch": 0.27022458343395317,
"grad_norm": 0.31522971391677856,
"learning_rate": 6.780959149307156e-05,
"loss": 1.7615,
"step": 1119
},
{
"epoch": 0.27046607099734365,
"grad_norm": 0.3177500069141388,
"learning_rate": 6.778732483427895e-05,
"loss": 1.8575,
"step": 1120
},
{
"epoch": 0.27070755856073414,
"grad_norm": 0.3230820596218109,
"learning_rate": 6.776504152134e-05,
"loss": 1.5958,
"step": 1121
},
{
"epoch": 0.2709490461241246,
"grad_norm": 0.30711400508880615,
"learning_rate": 6.774274156761004e-05,
"loss": 1.6727,
"step": 1122
},
{
"epoch": 0.2711905336875151,
"grad_norm": 0.3092270791530609,
"learning_rate": 6.772042498645446e-05,
"loss": 1.7553,
"step": 1123
},
{
"epoch": 0.2714320212509056,
"grad_norm": 0.3213646411895752,
"learning_rate": 6.769809179124851e-05,
"loss": 1.7792,
"step": 1124
},
{
"epoch": 0.2716735088142961,
"grad_norm": 0.3240359127521515,
"learning_rate": 6.767574199537744e-05,
"loss": 1.6274,
"step": 1125
},
{
"epoch": 0.27191499637768657,
"grad_norm": 0.2978704869747162,
"learning_rate": 6.765337561223647e-05,
"loss": 1.6713,
"step": 1126
},
{
"epoch": 0.27215648394107705,
"grad_norm": 0.3193483054637909,
"learning_rate": 6.763099265523073e-05,
"loss": 1.8093,
"step": 1127
},
{
"epoch": 0.27239797150446754,
"grad_norm": 0.31814515590667725,
"learning_rate": 6.760859313777531e-05,
"loss": 1.8676,
"step": 1128
},
{
"epoch": 0.272639459067858,
"grad_norm": 0.4061098098754883,
"learning_rate": 6.758617707329517e-05,
"loss": 1.9431,
"step": 1129
},
{
"epoch": 0.2728809466312485,
"grad_norm": 0.2891682982444763,
"learning_rate": 6.756374447522527e-05,
"loss": 1.6673,
"step": 1130
},
{
"epoch": 0.273122434194639,
"grad_norm": 0.33421790599823,
"learning_rate": 6.754129535701044e-05,
"loss": 1.8145,
"step": 1131
},
{
"epoch": 0.2733639217580295,
"grad_norm": 0.3227134346961975,
"learning_rate": 6.751882973210537e-05,
"loss": 1.7578,
"step": 1132
},
{
"epoch": 0.27360540932141997,
"grad_norm": 0.31454476714134216,
"learning_rate": 6.74963476139747e-05,
"loss": 1.7671,
"step": 1133
},
{
"epoch": 0.27384689688481045,
"grad_norm": 0.317026287317276,
"learning_rate": 6.747384901609294e-05,
"loss": 1.7762,
"step": 1134
},
{
"epoch": 0.27408838444820094,
"grad_norm": 0.2962716817855835,
"learning_rate": 6.745133395194447e-05,
"loss": 1.5824,
"step": 1135
},
{
"epoch": 0.2743298720115914,
"grad_norm": 0.30856993794441223,
"learning_rate": 6.742880243502354e-05,
"loss": 1.7647,
"step": 1136
},
{
"epoch": 0.2745713595749819,
"grad_norm": 0.319867879152298,
"learning_rate": 6.740625447883428e-05,
"loss": 1.6957,
"step": 1137
},
{
"epoch": 0.2748128471383724,
"grad_norm": 0.3080596625804901,
"learning_rate": 6.738369009689064e-05,
"loss": 1.7083,
"step": 1138
},
{
"epoch": 0.2750543347017629,
"grad_norm": 0.3023444712162018,
"learning_rate": 6.736110930271642e-05,
"loss": 1.6312,
"step": 1139
},
{
"epoch": 0.27529582226515337,
"grad_norm": 0.2923286259174347,
"learning_rate": 6.733851210984529e-05,
"loss": 1.6025,
"step": 1140
},
{
"epoch": 0.27553730982854385,
"grad_norm": 0.32204321026802063,
"learning_rate": 6.731589853182071e-05,
"loss": 1.6971,
"step": 1141
},
{
"epoch": 0.27577879739193434,
"grad_norm": 0.3155701160430908,
"learning_rate": 6.729326858219599e-05,
"loss": 1.801,
"step": 1142
},
{
"epoch": 0.2760202849553248,
"grad_norm": 0.30704465508461,
"learning_rate": 6.727062227453423e-05,
"loss": 1.8037,
"step": 1143
},
{
"epoch": 0.2762617725187153,
"grad_norm": 0.3244346082210541,
"learning_rate": 6.724795962240834e-05,
"loss": 1.7041,
"step": 1144
},
{
"epoch": 0.2765032600821058,
"grad_norm": 0.3154827952384949,
"learning_rate": 6.722528063940102e-05,
"loss": 1.8805,
"step": 1145
},
{
"epoch": 0.2767447476454963,
"grad_norm": 0.30870237946510315,
"learning_rate": 6.720258533910478e-05,
"loss": 1.6691,
"step": 1146
},
{
"epoch": 0.27698623520888677,
"grad_norm": 0.31408193707466125,
"learning_rate": 6.71798737351219e-05,
"loss": 1.5256,
"step": 1147
},
{
"epoch": 0.27722772277227725,
"grad_norm": 0.30898284912109375,
"learning_rate": 6.71571458410644e-05,
"loss": 1.7632,
"step": 1148
},
{
"epoch": 0.27746921033566774,
"grad_norm": 0.33258867263793945,
"learning_rate": 6.713440167055414e-05,
"loss": 1.9591,
"step": 1149
},
{
"epoch": 0.2777106978990582,
"grad_norm": 0.3209589421749115,
"learning_rate": 6.711164123722264e-05,
"loss": 1.8063,
"step": 1150
},
{
"epoch": 0.2779521854624487,
"grad_norm": 0.2860225439071655,
"learning_rate": 6.708886455471122e-05,
"loss": 1.6478,
"step": 1151
},
{
"epoch": 0.2781936730258392,
"grad_norm": 0.3342827558517456,
"learning_rate": 6.706607163667094e-05,
"loss": 1.7519,
"step": 1152
},
{
"epoch": 0.2784351605892297,
"grad_norm": 0.3273380398750305,
"learning_rate": 6.704326249676261e-05,
"loss": 1.7519,
"step": 1153
},
{
"epoch": 0.27867664815262017,
"grad_norm": 0.30446043610572815,
"learning_rate": 6.702043714865668e-05,
"loss": 1.8829,
"step": 1154
},
{
"epoch": 0.27891813571601065,
"grad_norm": 0.327215313911438,
"learning_rate": 6.69975956060334e-05,
"loss": 1.8613,
"step": 1155
},
{
"epoch": 0.27915962327940114,
"grad_norm": 0.3009372353553772,
"learning_rate": 6.697473788258269e-05,
"loss": 1.7337,
"step": 1156
},
{
"epoch": 0.2794011108427916,
"grad_norm": 0.2950308620929718,
"learning_rate": 6.695186399200416e-05,
"loss": 1.6854,
"step": 1157
},
{
"epoch": 0.2796425984061821,
"grad_norm": 0.3036794364452362,
"learning_rate": 6.692897394800716e-05,
"loss": 1.5876,
"step": 1158
},
{
"epoch": 0.2798840859695726,
"grad_norm": 0.34749823808670044,
"learning_rate": 6.690606776431066e-05,
"loss": 1.8013,
"step": 1159
},
{
"epoch": 0.2801255735329631,
"grad_norm": 0.31820452213287354,
"learning_rate": 6.688314545464331e-05,
"loss": 1.9421,
"step": 1160
},
{
"epoch": 0.28036706109635356,
"grad_norm": 0.318487286567688,
"learning_rate": 6.686020703274347e-05,
"loss": 1.7597,
"step": 1161
},
{
"epoch": 0.28060854865974405,
"grad_norm": 0.3037980794906616,
"learning_rate": 6.683725251235911e-05,
"loss": 1.7721,
"step": 1162
},
{
"epoch": 0.28085003622313454,
"grad_norm": 0.3123769760131836,
"learning_rate": 6.681428190724789e-05,
"loss": 1.6083,
"step": 1163
},
{
"epoch": 0.281091523786525,
"grad_norm": 0.3207729458808899,
"learning_rate": 6.679129523117706e-05,
"loss": 1.7156,
"step": 1164
},
{
"epoch": 0.28133301134991545,
"grad_norm": 0.31085196137428284,
"learning_rate": 6.676829249792355e-05,
"loss": 1.6714,
"step": 1165
},
{
"epoch": 0.28157449891330594,
"grad_norm": 0.3331851661205292,
"learning_rate": 6.674527372127389e-05,
"loss": 1.9505,
"step": 1166
},
{
"epoch": 0.2818159864766964,
"grad_norm": 0.3235446512699127,
"learning_rate": 6.67222389150242e-05,
"loss": 1.6968,
"step": 1167
},
{
"epoch": 0.2820574740400869,
"grad_norm": 0.31274673342704773,
"learning_rate": 6.66991880929803e-05,
"loss": 1.7527,
"step": 1168
},
{
"epoch": 0.2822989616034774,
"grad_norm": 0.29938244819641113,
"learning_rate": 6.667612126895748e-05,
"loss": 1.6292,
"step": 1169
},
{
"epoch": 0.2825404491668679,
"grad_norm": 0.3175697922706604,
"learning_rate": 6.665303845678072e-05,
"loss": 1.8182,
"step": 1170
},
{
"epoch": 0.28278193673025837,
"grad_norm": 0.315643310546875,
"learning_rate": 6.662993967028455e-05,
"loss": 1.6594,
"step": 1171
},
{
"epoch": 0.28302342429364885,
"grad_norm": 0.33789125084877014,
"learning_rate": 6.660682492331305e-05,
"loss": 1.7726,
"step": 1172
},
{
"epoch": 0.28326491185703934,
"grad_norm": 0.305123507976532,
"learning_rate": 6.65836942297199e-05,
"loss": 1.7628,
"step": 1173
},
{
"epoch": 0.2835063994204298,
"grad_norm": 0.31694090366363525,
"learning_rate": 6.656054760336834e-05,
"loss": 1.7854,
"step": 1174
},
{
"epoch": 0.2837478869838203,
"grad_norm": 0.3205931484699249,
"learning_rate": 6.653738505813114e-05,
"loss": 1.856,
"step": 1175
},
{
"epoch": 0.2839893745472108,
"grad_norm": 0.303684800863266,
"learning_rate": 6.651420660789061e-05,
"loss": 1.6458,
"step": 1176
},
{
"epoch": 0.2842308621106013,
"grad_norm": 0.32911720871925354,
"learning_rate": 6.649101226653857e-05,
"loss": 1.9941,
"step": 1177
},
{
"epoch": 0.28447234967399176,
"grad_norm": 0.33249083161354065,
"learning_rate": 6.646780204797644e-05,
"loss": 1.8222,
"step": 1178
},
{
"epoch": 0.28471383723738225,
"grad_norm": 0.316508948802948,
"learning_rate": 6.644457596611508e-05,
"loss": 1.6778,
"step": 1179
},
{
"epoch": 0.28495532480077274,
"grad_norm": 0.31347498297691345,
"learning_rate": 6.642133403487491e-05,
"loss": 1.6783,
"step": 1180
},
{
"epoch": 0.2851968123641632,
"grad_norm": 0.30862924456596375,
"learning_rate": 6.639807626818579e-05,
"loss": 1.7761,
"step": 1181
},
{
"epoch": 0.2854382999275537,
"grad_norm": 0.31827664375305176,
"learning_rate": 6.637480267998713e-05,
"loss": 1.7578,
"step": 1182
},
{
"epoch": 0.2856797874909442,
"grad_norm": 0.3379991352558136,
"learning_rate": 6.63515132842278e-05,
"loss": 1.9115,
"step": 1183
},
{
"epoch": 0.2859212750543347,
"grad_norm": 0.3422394394874573,
"learning_rate": 6.632820809486612e-05,
"loss": 1.9046,
"step": 1184
},
{
"epoch": 0.28616276261772516,
"grad_norm": 0.3095594644546509,
"learning_rate": 6.630488712586992e-05,
"loss": 1.6895,
"step": 1185
},
{
"epoch": 0.28640425018111565,
"grad_norm": 0.31483933329582214,
"learning_rate": 6.628155039121649e-05,
"loss": 1.7782,
"step": 1186
},
{
"epoch": 0.28664573774450613,
"grad_norm": 0.30376529693603516,
"learning_rate": 6.625819790489248e-05,
"loss": 1.7967,
"step": 1187
},
{
"epoch": 0.2868872253078966,
"grad_norm": 0.29914677143096924,
"learning_rate": 6.623482968089409e-05,
"loss": 1.6851,
"step": 1188
},
{
"epoch": 0.2871287128712871,
"grad_norm": 0.3073113262653351,
"learning_rate": 6.62114457332269e-05,
"loss": 1.779,
"step": 1189
},
{
"epoch": 0.2873702004346776,
"grad_norm": 0.3353249728679657,
"learning_rate": 6.618804607590593e-05,
"loss": 1.9511,
"step": 1190
},
{
"epoch": 0.2876116879980681,
"grad_norm": 0.38719838857650757,
"learning_rate": 6.616463072295559e-05,
"loss": 1.8926,
"step": 1191
},
{
"epoch": 0.28785317556145856,
"grad_norm": 0.3062868118286133,
"learning_rate": 6.614119968840974e-05,
"loss": 1.6365,
"step": 1192
},
{
"epoch": 0.28809466312484905,
"grad_norm": 0.3393278121948242,
"learning_rate": 6.611775298631159e-05,
"loss": 1.8572,
"step": 1193
},
{
"epoch": 0.28833615068823953,
"grad_norm": 0.29543522000312805,
"learning_rate": 6.609429063071377e-05,
"loss": 1.6415,
"step": 1194
},
{
"epoch": 0.28857763825163,
"grad_norm": 0.29946866631507874,
"learning_rate": 6.607081263567827e-05,
"loss": 1.6446,
"step": 1195
},
{
"epoch": 0.2888191258150205,
"grad_norm": 0.3174428343772888,
"learning_rate": 6.604731901527649e-05,
"loss": 1.8043,
"step": 1196
},
{
"epoch": 0.289060613378411,
"grad_norm": 0.3169965147972107,
"learning_rate": 6.602380978358918e-05,
"loss": 1.816,
"step": 1197
},
{
"epoch": 0.2893021009418015,
"grad_norm": 0.315725713968277,
"learning_rate": 6.600028495470642e-05,
"loss": 1.7574,
"step": 1198
},
{
"epoch": 0.28954358850519196,
"grad_norm": 0.3309673070907593,
"learning_rate": 6.597674454272765e-05,
"loss": 1.9127,
"step": 1199
},
{
"epoch": 0.28978507606858245,
"grad_norm": 0.28161945939064026,
"learning_rate": 6.595318856176169e-05,
"loss": 1.5288,
"step": 1200
},
{
"epoch": 0.29002656363197293,
"grad_norm": 0.3262319564819336,
"learning_rate": 6.592961702592662e-05,
"loss": 1.836,
"step": 1201
},
{
"epoch": 0.2902680511953634,
"grad_norm": 0.30109328031539917,
"learning_rate": 6.590602994934993e-05,
"loss": 1.6786,
"step": 1202
},
{
"epoch": 0.2905095387587539,
"grad_norm": 0.30040109157562256,
"learning_rate": 6.588242734616833e-05,
"loss": 1.6941,
"step": 1203
},
{
"epoch": 0.2907510263221444,
"grad_norm": 0.3145768344402313,
"learning_rate": 6.58588092305279e-05,
"loss": 1.6959,
"step": 1204
},
{
"epoch": 0.2909925138855349,
"grad_norm": 0.32511648535728455,
"learning_rate": 6.583517561658401e-05,
"loss": 1.6826,
"step": 1205
},
{
"epoch": 0.29123400144892536,
"grad_norm": 0.3018747866153717,
"learning_rate": 6.58115265185013e-05,
"loss": 1.7152,
"step": 1206
},
{
"epoch": 0.29147548901231585,
"grad_norm": 0.34387052059173584,
"learning_rate": 6.578786195045368e-05,
"loss": 1.8679,
"step": 1207
},
{
"epoch": 0.29171697657570633,
"grad_norm": 0.35244396328926086,
"learning_rate": 6.576418192662436e-05,
"loss": 1.9484,
"step": 1208
},
{
"epoch": 0.2919584641390968,
"grad_norm": 0.3194985091686249,
"learning_rate": 6.574048646120582e-05,
"loss": 1.7235,
"step": 1209
},
{
"epoch": 0.2921999517024873,
"grad_norm": 0.30800020694732666,
"learning_rate": 6.571677556839976e-05,
"loss": 1.7975,
"step": 1210
},
{
"epoch": 0.2924414392658778,
"grad_norm": 0.3275563716888428,
"learning_rate": 6.569304926241715e-05,
"loss": 2.0784,
"step": 1211
},
{
"epoch": 0.2926829268292683,
"grad_norm": 0.3347807824611664,
"learning_rate": 6.566930755747821e-05,
"loss": 1.9368,
"step": 1212
},
{
"epoch": 0.29292441439265876,
"grad_norm": 0.3164602220058441,
"learning_rate": 6.564555046781232e-05,
"loss": 1.7644,
"step": 1213
},
{
"epoch": 0.29316590195604925,
"grad_norm": 0.29280054569244385,
"learning_rate": 6.562177800765819e-05,
"loss": 1.5814,
"step": 1214
},
{
"epoch": 0.29340738951943973,
"grad_norm": 0.29822081327438354,
"learning_rate": 6.559799019126365e-05,
"loss": 1.657,
"step": 1215
},
{
"epoch": 0.2936488770828302,
"grad_norm": 0.30289626121520996,
"learning_rate": 6.557418703288578e-05,
"loss": 1.8473,
"step": 1216
},
{
"epoch": 0.2938903646462207,
"grad_norm": 0.301372766494751,
"learning_rate": 6.555036854679083e-05,
"loss": 1.6417,
"step": 1217
},
{
"epoch": 0.2941318522096112,
"grad_norm": 0.31504714488983154,
"learning_rate": 6.552653474725427e-05,
"loss": 1.8229,
"step": 1218
},
{
"epoch": 0.2943733397730017,
"grad_norm": 0.31168702244758606,
"learning_rate": 6.550268564856071e-05,
"loss": 1.8771,
"step": 1219
},
{
"epoch": 0.29461482733639216,
"grad_norm": 0.31231966614723206,
"learning_rate": 6.547882126500395e-05,
"loss": 1.7629,
"step": 1220
},
{
"epoch": 0.29485631489978265,
"grad_norm": 0.33414316177368164,
"learning_rate": 6.545494161088696e-05,
"loss": 1.8771,
"step": 1221
},
{
"epoch": 0.29509780246317313,
"grad_norm": 0.3109551966190338,
"learning_rate": 6.543104670052183e-05,
"loss": 1.7909,
"step": 1222
},
{
"epoch": 0.2953392900265636,
"grad_norm": 0.31267592310905457,
"learning_rate": 6.540713654822984e-05,
"loss": 1.739,
"step": 1223
},
{
"epoch": 0.2955807775899541,
"grad_norm": 0.3014405071735382,
"learning_rate": 6.538321116834135e-05,
"loss": 1.6701,
"step": 1224
},
{
"epoch": 0.2958222651533446,
"grad_norm": 0.30508482456207275,
"learning_rate": 6.535927057519591e-05,
"loss": 1.6512,
"step": 1225
},
{
"epoch": 0.2960637527167351,
"grad_norm": 0.314610093832016,
"learning_rate": 6.533531478314212e-05,
"loss": 1.7084,
"step": 1226
},
{
"epoch": 0.29630524028012556,
"grad_norm": 0.33772405982017517,
"learning_rate": 6.531134380653774e-05,
"loss": 1.7774,
"step": 1227
},
{
"epoch": 0.29654672784351604,
"grad_norm": 0.3018490672111511,
"learning_rate": 6.52873576597496e-05,
"loss": 1.6606,
"step": 1228
},
{
"epoch": 0.29678821540690653,
"grad_norm": 0.30412787199020386,
"learning_rate": 6.526335635715365e-05,
"loss": 1.6831,
"step": 1229
},
{
"epoch": 0.297029702970297,
"grad_norm": 0.3108169734477997,
"learning_rate": 6.523933991313491e-05,
"loss": 1.6385,
"step": 1230
},
{
"epoch": 0.2972711905336875,
"grad_norm": 0.2972755432128906,
"learning_rate": 6.521530834208748e-05,
"loss": 1.6056,
"step": 1231
},
{
"epoch": 0.297512678097078,
"grad_norm": 0.3079698085784912,
"learning_rate": 6.519126165841449e-05,
"loss": 1.6153,
"step": 1232
},
{
"epoch": 0.2977541656604685,
"grad_norm": 0.31394216418266296,
"learning_rate": 6.516719987652819e-05,
"loss": 1.8146,
"step": 1233
},
{
"epoch": 0.29799565322385896,
"grad_norm": 0.3352009952068329,
"learning_rate": 6.514312301084983e-05,
"loss": 1.8685,
"step": 1234
},
{
"epoch": 0.29823714078724944,
"grad_norm": 0.3122904598712921,
"learning_rate": 6.511903107580973e-05,
"loss": 1.6114,
"step": 1235
},
{
"epoch": 0.29847862835063993,
"grad_norm": 0.32792016863822937,
"learning_rate": 6.509492408584723e-05,
"loss": 1.7083,
"step": 1236
},
{
"epoch": 0.2987201159140304,
"grad_norm": 0.3028082549571991,
"learning_rate": 6.507080205541068e-05,
"loss": 1.7556,
"step": 1237
},
{
"epoch": 0.2989616034774209,
"grad_norm": 0.3209961950778961,
"learning_rate": 6.504666499895746e-05,
"loss": 1.8044,
"step": 1238
},
{
"epoch": 0.2992030910408114,
"grad_norm": 0.3235771059989929,
"learning_rate": 6.502251293095394e-05,
"loss": 1.7247,
"step": 1239
},
{
"epoch": 0.29944457860420187,
"grad_norm": 0.32376086711883545,
"learning_rate": 6.499834586587552e-05,
"loss": 1.9585,
"step": 1240
},
{
"epoch": 0.29968606616759236,
"grad_norm": 0.33287283778190613,
"learning_rate": 6.497416381820656e-05,
"loss": 1.7671,
"step": 1241
},
{
"epoch": 0.29992755373098284,
"grad_norm": 0.3164242208003998,
"learning_rate": 6.494996680244044e-05,
"loss": 1.6698,
"step": 1242
},
{
"epoch": 0.30016904129437333,
"grad_norm": 0.31160667538642883,
"learning_rate": 6.49257548330794e-05,
"loss": 1.6573,
"step": 1243
},
{
"epoch": 0.3004105288577638,
"grad_norm": 0.3194431662559509,
"learning_rate": 6.49015279246348e-05,
"loss": 1.7531,
"step": 1244
},
{
"epoch": 0.3006520164211543,
"grad_norm": 0.3261190950870514,
"learning_rate": 6.487728609162684e-05,
"loss": 1.7351,
"step": 1245
},
{
"epoch": 0.3008935039845448,
"grad_norm": 0.31367936730384827,
"learning_rate": 6.48530293485847e-05,
"loss": 1.7731,
"step": 1246
},
{
"epoch": 0.30113499154793527,
"grad_norm": 0.3270440995693207,
"learning_rate": 6.48287577100465e-05,
"loss": 1.81,
"step": 1247
},
{
"epoch": 0.30137647911132576,
"grad_norm": 0.3517354428768158,
"learning_rate": 6.480447119055929e-05,
"loss": 1.9651,
"step": 1248
},
{
"epoch": 0.30161796667471624,
"grad_norm": 0.33357125520706177,
"learning_rate": 6.478016980467901e-05,
"loss": 1.8683,
"step": 1249
},
{
"epoch": 0.30185945423810673,
"grad_norm": 0.32203903794288635,
"learning_rate": 6.475585356697056e-05,
"loss": 1.6253,
"step": 1250
},
{
"epoch": 0.3021009418014972,
"grad_norm": 0.33326178789138794,
"learning_rate": 6.473152249200771e-05,
"loss": 1.848,
"step": 1251
},
{
"epoch": 0.3023424293648877,
"grad_norm": 0.30899542570114136,
"learning_rate": 6.470717659437309e-05,
"loss": 1.7319,
"step": 1252
},
{
"epoch": 0.3025839169282782,
"grad_norm": 0.30828720331192017,
"learning_rate": 6.46828158886583e-05,
"loss": 1.6604,
"step": 1253
},
{
"epoch": 0.30282540449166867,
"grad_norm": 0.3537421226501465,
"learning_rate": 6.465844038946374e-05,
"loss": 1.7714,
"step": 1254
},
{
"epoch": 0.30306689205505916,
"grad_norm": 0.31548118591308594,
"learning_rate": 6.463405011139869e-05,
"loss": 1.7093,
"step": 1255
},
{
"epoch": 0.30330837961844964,
"grad_norm": 0.32706400752067566,
"learning_rate": 6.460964506908133e-05,
"loss": 1.8115,
"step": 1256
},
{
"epoch": 0.3035498671818401,
"grad_norm": 0.317264199256897,
"learning_rate": 6.458522527713862e-05,
"loss": 1.7378,
"step": 1257
},
{
"epoch": 0.3037913547452306,
"grad_norm": 0.3122968375682831,
"learning_rate": 6.456079075020644e-05,
"loss": 1.783,
"step": 1258
},
{
"epoch": 0.3040328423086211,
"grad_norm": 0.37204188108444214,
"learning_rate": 6.453634150292943e-05,
"loss": 2.2071,
"step": 1259
},
{
"epoch": 0.3042743298720116,
"grad_norm": 0.3135020434856415,
"learning_rate": 6.451187754996109e-05,
"loss": 1.8169,
"step": 1260
},
{
"epoch": 0.30451581743540207,
"grad_norm": 0.30854514241218567,
"learning_rate": 6.448739890596373e-05,
"loss": 1.4994,
"step": 1261
},
{
"epoch": 0.30475730499879256,
"grad_norm": 0.3245062232017517,
"learning_rate": 6.446290558560845e-05,
"loss": 1.794,
"step": 1262
},
{
"epoch": 0.30499879256218304,
"grad_norm": 0.31106555461883545,
"learning_rate": 6.443839760357517e-05,
"loss": 1.706,
"step": 1263
},
{
"epoch": 0.3052402801255735,
"grad_norm": 0.3059476912021637,
"learning_rate": 6.441387497455259e-05,
"loss": 1.7567,
"step": 1264
},
{
"epoch": 0.305481767688964,
"grad_norm": 0.3407411575317383,
"learning_rate": 6.438933771323816e-05,
"loss": 1.8181,
"step": 1265
},
{
"epoch": 0.3057232552523545,
"grad_norm": 0.3178406357765198,
"learning_rate": 6.436478583433812e-05,
"loss": 1.5195,
"step": 1266
},
{
"epoch": 0.305964742815745,
"grad_norm": 0.31224748492240906,
"learning_rate": 6.43402193525675e-05,
"loss": 1.5679,
"step": 1267
},
{
"epoch": 0.30620623037913547,
"grad_norm": 0.36681729555130005,
"learning_rate": 6.431563828265005e-05,
"loss": 2.1015,
"step": 1268
},
{
"epoch": 0.30644771794252595,
"grad_norm": 0.315141886472702,
"learning_rate": 6.429104263931825e-05,
"loss": 1.6646,
"step": 1269
},
{
"epoch": 0.30668920550591644,
"grad_norm": 0.3100356161594391,
"learning_rate": 6.426643243731336e-05,
"loss": 1.6975,
"step": 1270
},
{
"epoch": 0.3069306930693069,
"grad_norm": 0.33374282717704773,
"learning_rate": 6.424180769138531e-05,
"loss": 1.8585,
"step": 1271
},
{
"epoch": 0.3071721806326974,
"grad_norm": 0.3375101685523987,
"learning_rate": 6.42171684162928e-05,
"loss": 1.7872,
"step": 1272
},
{
"epoch": 0.3074136681960879,
"grad_norm": 0.3159698247909546,
"learning_rate": 6.41925146268032e-05,
"loss": 1.8521,
"step": 1273
},
{
"epoch": 0.3076551557594784,
"grad_norm": 0.3374696969985962,
"learning_rate": 6.416784633769261e-05,
"loss": 1.7629,
"step": 1274
},
{
"epoch": 0.30789664332286887,
"grad_norm": 0.3099079430103302,
"learning_rate": 6.414316356374578e-05,
"loss": 1.6937,
"step": 1275
},
{
"epoch": 0.30813813088625935,
"grad_norm": 0.30903059244155884,
"learning_rate": 6.411846631975618e-05,
"loss": 1.855,
"step": 1276
},
{
"epoch": 0.30837961844964984,
"grad_norm": 0.30359727144241333,
"learning_rate": 6.409375462052594e-05,
"loss": 1.7491,
"step": 1277
},
{
"epoch": 0.3086211060130403,
"grad_norm": 0.30317601561546326,
"learning_rate": 6.406902848086582e-05,
"loss": 1.6889,
"step": 1278
},
{
"epoch": 0.3088625935764308,
"grad_norm": 0.2974912226200104,
"learning_rate": 6.40442879155953e-05,
"loss": 1.6461,
"step": 1279
},
{
"epoch": 0.3091040811398213,
"grad_norm": 0.3059878647327423,
"learning_rate": 6.401953293954246e-05,
"loss": 1.5723,
"step": 1280
},
{
"epoch": 0.3093455687032118,
"grad_norm": 0.31517407298088074,
"learning_rate": 6.399476356754403e-05,
"loss": 1.6743,
"step": 1281
},
{
"epoch": 0.30958705626660227,
"grad_norm": 0.3091956377029419,
"learning_rate": 6.396997981444537e-05,
"loss": 1.7329,
"step": 1282
},
{
"epoch": 0.30982854382999275,
"grad_norm": 0.3104307949542999,
"learning_rate": 6.394518169510044e-05,
"loss": 1.7746,
"step": 1283
},
{
"epoch": 0.31007003139338324,
"grad_norm": 0.3237158954143524,
"learning_rate": 6.392036922437185e-05,
"loss": 1.6943,
"step": 1284
},
{
"epoch": 0.3103115189567737,
"grad_norm": 0.33262524008750916,
"learning_rate": 6.389554241713077e-05,
"loss": 1.81,
"step": 1285
},
{
"epoch": 0.3105530065201642,
"grad_norm": 0.3406033515930176,
"learning_rate": 6.387070128825698e-05,
"loss": 1.6864,
"step": 1286
},
{
"epoch": 0.3107944940835547,
"grad_norm": 0.3164110481739044,
"learning_rate": 6.384584585263885e-05,
"loss": 1.727,
"step": 1287
},
{
"epoch": 0.3110359816469452,
"grad_norm": 0.32497438788414,
"learning_rate": 6.382097612517333e-05,
"loss": 1.7212,
"step": 1288
},
{
"epoch": 0.31127746921033567,
"grad_norm": 0.3328503668308258,
"learning_rate": 6.37960921207659e-05,
"loss": 1.6775,
"step": 1289
},
{
"epoch": 0.31151895677372615,
"grad_norm": 0.32597246766090393,
"learning_rate": 6.377119385433063e-05,
"loss": 1.6844,
"step": 1290
},
{
"epoch": 0.31176044433711664,
"grad_norm": 0.31859007477760315,
"learning_rate": 6.374628134079012e-05,
"loss": 1.6906,
"step": 1291
},
{
"epoch": 0.3120019319005071,
"grad_norm": 0.31793224811553955,
"learning_rate": 6.372135459507556e-05,
"loss": 1.6995,
"step": 1292
},
{
"epoch": 0.3122434194638976,
"grad_norm": 0.3268589973449707,
"learning_rate": 6.369641363212656e-05,
"loss": 1.6803,
"step": 1293
},
{
"epoch": 0.3124849070272881,
"grad_norm": 0.31107431650161743,
"learning_rate": 6.367145846689138e-05,
"loss": 1.6134,
"step": 1294
},
{
"epoch": 0.3127263945906786,
"grad_norm": 0.3370269536972046,
"learning_rate": 6.36464891143267e-05,
"loss": 1.8846,
"step": 1295
},
{
"epoch": 0.31296788215406907,
"grad_norm": 0.3170960247516632,
"learning_rate": 6.362150558939772e-05,
"loss": 1.7981,
"step": 1296
},
{
"epoch": 0.31320936971745955,
"grad_norm": 0.33306363224983215,
"learning_rate": 6.359650790707818e-05,
"loss": 1.8478,
"step": 1297
},
{
"epoch": 0.31345085728085004,
"grad_norm": 0.3303205072879791,
"learning_rate": 6.357149608235025e-05,
"loss": 1.8357,
"step": 1298
},
{
"epoch": 0.3136923448442405,
"grad_norm": 0.3151163160800934,
"learning_rate": 6.354647013020461e-05,
"loss": 1.4975,
"step": 1299
},
{
"epoch": 0.313933832407631,
"grad_norm": 0.3328797221183777,
"learning_rate": 6.35214300656404e-05,
"loss": 1.7985,
"step": 1300
},
{
"epoch": 0.3141753199710215,
"grad_norm": 0.3165968656539917,
"learning_rate": 6.34963759036652e-05,
"loss": 1.8291,
"step": 1301
},
{
"epoch": 0.314416807534412,
"grad_norm": 0.3316687047481537,
"learning_rate": 6.347130765929507e-05,
"loss": 1.8008,
"step": 1302
},
{
"epoch": 0.31465829509780247,
"grad_norm": 0.3069915175437927,
"learning_rate": 6.344622534755449e-05,
"loss": 1.6981,
"step": 1303
},
{
"epoch": 0.31489978266119295,
"grad_norm": 0.33111026883125305,
"learning_rate": 6.342112898347635e-05,
"loss": 1.8564,
"step": 1304
},
{
"epoch": 0.31514127022458344,
"grad_norm": 0.31829777359962463,
"learning_rate": 6.339601858210202e-05,
"loss": 1.8491,
"step": 1305
},
{
"epoch": 0.3153827577879739,
"grad_norm": 0.29953789710998535,
"learning_rate": 6.337089415848124e-05,
"loss": 1.686,
"step": 1306
},
{
"epoch": 0.3156242453513644,
"grad_norm": 0.32130008935928345,
"learning_rate": 6.334575572767214e-05,
"loss": 1.7548,
"step": 1307
},
{
"epoch": 0.3158657329147549,
"grad_norm": 0.31819820404052734,
"learning_rate": 6.332060330474131e-05,
"loss": 1.6416,
"step": 1308
},
{
"epoch": 0.3161072204781454,
"grad_norm": 0.3157431185245514,
"learning_rate": 6.329543690476368e-05,
"loss": 1.6687,
"step": 1309
},
{
"epoch": 0.31634870804153586,
"grad_norm": 0.32807284593582153,
"learning_rate": 6.327025654282253e-05,
"loss": 1.7713,
"step": 1310
},
{
"epoch": 0.31659019560492635,
"grad_norm": 0.32748138904571533,
"learning_rate": 6.324506223400957e-05,
"loss": 1.7929,
"step": 1311
},
{
"epoch": 0.31683168316831684,
"grad_norm": 0.3302832841873169,
"learning_rate": 6.321985399342481e-05,
"loss": 1.8137,
"step": 1312
},
{
"epoch": 0.3170731707317073,
"grad_norm": 0.30940601229667664,
"learning_rate": 6.319463183617669e-05,
"loss": 1.6142,
"step": 1313
},
{
"epoch": 0.3173146582950978,
"grad_norm": 0.30194124579429626,
"learning_rate": 6.316939577738189e-05,
"loss": 1.7115,
"step": 1314
},
{
"epoch": 0.3175561458584883,
"grad_norm": 0.3257052004337311,
"learning_rate": 6.314414583216548e-05,
"loss": 1.7879,
"step": 1315
},
{
"epoch": 0.3177976334218788,
"grad_norm": 0.324390709400177,
"learning_rate": 6.311888201566088e-05,
"loss": 1.9028,
"step": 1316
},
{
"epoch": 0.31803912098526926,
"grad_norm": 0.3202170431613922,
"learning_rate": 6.309360434300975e-05,
"loss": 1.7865,
"step": 1317
},
{
"epoch": 0.31828060854865975,
"grad_norm": 0.3127957284450531,
"learning_rate": 6.306831282936212e-05,
"loss": 1.7583,
"step": 1318
},
{
"epoch": 0.31852209611205023,
"grad_norm": 0.3177083730697632,
"learning_rate": 6.304300748987627e-05,
"loss": 1.7228,
"step": 1319
},
{
"epoch": 0.3187635836754407,
"grad_norm": 0.33234596252441406,
"learning_rate": 6.30176883397188e-05,
"loss": 1.8422,
"step": 1320
},
{
"epoch": 0.3190050712388312,
"grad_norm": 0.3085367679595947,
"learning_rate": 6.299235539406456e-05,
"loss": 1.7398,
"step": 1321
},
{
"epoch": 0.3192465588022217,
"grad_norm": 0.32077756524086,
"learning_rate": 6.296700866809667e-05,
"loss": 1.6157,
"step": 1322
},
{
"epoch": 0.3194880463656122,
"grad_norm": 0.30447328090667725,
"learning_rate": 6.294164817700655e-05,
"loss": 1.6457,
"step": 1323
},
{
"epoch": 0.31972953392900266,
"grad_norm": 0.3148518204689026,
"learning_rate": 6.291627393599383e-05,
"loss": 1.8575,
"step": 1324
},
{
"epoch": 0.31997102149239315,
"grad_norm": 0.30383065342903137,
"learning_rate": 6.289088596026638e-05,
"loss": 1.7007,
"step": 1325
},
{
"epoch": 0.32021250905578363,
"grad_norm": 0.31243109703063965,
"learning_rate": 6.286548426504033e-05,
"loss": 1.7474,
"step": 1326
},
{
"epoch": 0.3204539966191741,
"grad_norm": 0.3217732906341553,
"learning_rate": 6.284006886553998e-05,
"loss": 1.7636,
"step": 1327
},
{
"epoch": 0.3206954841825646,
"grad_norm": 0.3129735589027405,
"learning_rate": 6.281463977699793e-05,
"loss": 1.7425,
"step": 1328
},
{
"epoch": 0.3209369717459551,
"grad_norm": 0.33060258626937866,
"learning_rate": 6.278919701465489e-05,
"loss": 1.8192,
"step": 1329
},
{
"epoch": 0.3211784593093456,
"grad_norm": 0.31487327814102173,
"learning_rate": 6.276374059375983e-05,
"loss": 1.783,
"step": 1330
},
{
"epoch": 0.32141994687273606,
"grad_norm": 0.31960222125053406,
"learning_rate": 6.273827052956986e-05,
"loss": 1.7372,
"step": 1331
},
{
"epoch": 0.32166143443612655,
"grad_norm": 0.30207306146621704,
"learning_rate": 6.271278683735033e-05,
"loss": 1.5479,
"step": 1332
},
{
"epoch": 0.32190292199951703,
"grad_norm": 0.3223114013671875,
"learning_rate": 6.26872895323747e-05,
"loss": 1.8627,
"step": 1333
},
{
"epoch": 0.3221444095629075,
"grad_norm": 0.3229098618030548,
"learning_rate": 6.26617786299246e-05,
"loss": 1.8045,
"step": 1334
},
{
"epoch": 0.322385897126298,
"grad_norm": 0.2999171316623688,
"learning_rate": 6.263625414528983e-05,
"loss": 1.6994,
"step": 1335
},
{
"epoch": 0.3226273846896885,
"grad_norm": 0.30813145637512207,
"learning_rate": 6.261071609376832e-05,
"loss": 1.7092,
"step": 1336
},
{
"epoch": 0.322868872253079,
"grad_norm": 0.31752750277519226,
"learning_rate": 6.258516449066612e-05,
"loss": 1.8286,
"step": 1337
},
{
"epoch": 0.32311035981646946,
"grad_norm": 0.3602254390716553,
"learning_rate": 6.255959935129742e-05,
"loss": 1.9201,
"step": 1338
},
{
"epoch": 0.32335184737985995,
"grad_norm": 0.3091624975204468,
"learning_rate": 6.253402069098451e-05,
"loss": 1.7353,
"step": 1339
},
{
"epoch": 0.32359333494325043,
"grad_norm": 0.31246787309646606,
"learning_rate": 6.250842852505778e-05,
"loss": 1.7084,
"step": 1340
},
{
"epoch": 0.3238348225066409,
"grad_norm": 0.32028380036354065,
"learning_rate": 6.248282286885574e-05,
"loss": 1.7276,
"step": 1341
},
{
"epoch": 0.3240763100700314,
"grad_norm": 0.32509827613830566,
"learning_rate": 6.245720373772496e-05,
"loss": 1.8808,
"step": 1342
},
{
"epoch": 0.3243177976334219,
"grad_norm": 0.34290429949760437,
"learning_rate": 6.243157114702009e-05,
"loss": 1.9521,
"step": 1343
},
{
"epoch": 0.3245592851968124,
"grad_norm": 0.3149389326572418,
"learning_rate": 6.240592511210385e-05,
"loss": 1.8657,
"step": 1344
},
{
"epoch": 0.32480077276020286,
"grad_norm": 0.3112652599811554,
"learning_rate": 6.238026564834702e-05,
"loss": 1.6536,
"step": 1345
},
{
"epoch": 0.32504226032359335,
"grad_norm": 0.30453184247016907,
"learning_rate": 6.235459277112844e-05,
"loss": 1.574,
"step": 1346
},
{
"epoch": 0.32528374788698383,
"grad_norm": 0.3088798522949219,
"learning_rate": 6.232890649583496e-05,
"loss": 1.6068,
"step": 1347
},
{
"epoch": 0.3255252354503743,
"grad_norm": 0.3275122046470642,
"learning_rate": 6.230320683786148e-05,
"loss": 1.9809,
"step": 1348
},
{
"epoch": 0.3257667230137648,
"grad_norm": 0.31893762946128845,
"learning_rate": 6.227749381261092e-05,
"loss": 1.7996,
"step": 1349
},
{
"epoch": 0.3260082105771553,
"grad_norm": 0.3026633858680725,
"learning_rate": 6.22517674354942e-05,
"loss": 1.5886,
"step": 1350
},
{
"epoch": 0.3262496981405458,
"grad_norm": 0.31581586599349976,
"learning_rate": 6.222602772193028e-05,
"loss": 1.7078,
"step": 1351
},
{
"epoch": 0.32649118570393626,
"grad_norm": 0.30987709760665894,
"learning_rate": 6.220027468734605e-05,
"loss": 1.7288,
"step": 1352
},
{
"epoch": 0.32673267326732675,
"grad_norm": 0.3298446834087372,
"learning_rate": 6.217450834717644e-05,
"loss": 1.8196,
"step": 1353
},
{
"epoch": 0.32697416083071723,
"grad_norm": 0.30796289443969727,
"learning_rate": 6.214872871686433e-05,
"loss": 1.7249,
"step": 1354
},
{
"epoch": 0.3272156483941077,
"grad_norm": 0.30453404784202576,
"learning_rate": 6.212293581186055e-05,
"loss": 1.6672,
"step": 1355
},
{
"epoch": 0.3274571359574982,
"grad_norm": 0.3229547142982483,
"learning_rate": 6.209712964762393e-05,
"loss": 1.8192,
"step": 1356
},
{
"epoch": 0.3276986235208887,
"grad_norm": 0.33560508489608765,
"learning_rate": 6.20713102396212e-05,
"loss": 1.7354,
"step": 1357
},
{
"epoch": 0.3279401110842792,
"grad_norm": 0.3196660280227661,
"learning_rate": 6.204547760332705e-05,
"loss": 1.6965,
"step": 1358
},
{
"epoch": 0.32818159864766966,
"grad_norm": 0.3183029592037201,
"learning_rate": 6.201963175422412e-05,
"loss": 1.785,
"step": 1359
},
{
"epoch": 0.32842308621106014,
"grad_norm": 0.3159939646720886,
"learning_rate": 6.199377270780291e-05,
"loss": 1.6972,
"step": 1360
},
{
"epoch": 0.32866457377445063,
"grad_norm": 0.33196038007736206,
"learning_rate": 6.19679004795619e-05,
"loss": 1.7643,
"step": 1361
},
{
"epoch": 0.3289060613378411,
"grad_norm": 0.32369866967201233,
"learning_rate": 6.194201508500742e-05,
"loss": 1.8385,
"step": 1362
},
{
"epoch": 0.3291475489012316,
"grad_norm": 0.31358596682548523,
"learning_rate": 6.191611653965371e-05,
"loss": 1.826,
"step": 1363
},
{
"epoch": 0.3293890364646221,
"grad_norm": 0.3112541735172272,
"learning_rate": 6.189020485902287e-05,
"loss": 1.7407,
"step": 1364
},
{
"epoch": 0.3296305240280126,
"grad_norm": 0.3198995590209961,
"learning_rate": 6.186428005864492e-05,
"loss": 1.5329,
"step": 1365
},
{
"epoch": 0.32987201159140306,
"grad_norm": 0.3354710340499878,
"learning_rate": 6.183834215405772e-05,
"loss": 1.7694,
"step": 1366
},
{
"epoch": 0.33011349915479354,
"grad_norm": 0.3100753426551819,
"learning_rate": 6.181239116080693e-05,
"loss": 1.7828,
"step": 1367
},
{
"epoch": 0.33035498671818403,
"grad_norm": 0.3105238974094391,
"learning_rate": 6.178642709444616e-05,
"loss": 1.8108,
"step": 1368
},
{
"epoch": 0.3305964742815745,
"grad_norm": 0.3389638066291809,
"learning_rate": 6.176044997053677e-05,
"loss": 1.9256,
"step": 1369
},
{
"epoch": 0.330837961844965,
"grad_norm": 0.30497318506240845,
"learning_rate": 6.173445980464799e-05,
"loss": 1.6612,
"step": 1370
},
{
"epoch": 0.3310794494083555,
"grad_norm": 0.3220541179180145,
"learning_rate": 6.170845661235681e-05,
"loss": 1.6502,
"step": 1371
},
{
"epoch": 0.33132093697174597,
"grad_norm": 0.3109511137008667,
"learning_rate": 6.168244040924813e-05,
"loss": 1.7243,
"step": 1372
},
{
"epoch": 0.33156242453513646,
"grad_norm": 0.31736430525779724,
"learning_rate": 6.165641121091454e-05,
"loss": 1.8114,
"step": 1373
},
{
"epoch": 0.33180391209852694,
"grad_norm": 0.3181619644165039,
"learning_rate": 6.163036903295649e-05,
"loss": 1.714,
"step": 1374
},
{
"epoch": 0.33204539966191743,
"grad_norm": 0.33182013034820557,
"learning_rate": 6.160431389098216e-05,
"loss": 1.809,
"step": 1375
},
{
"epoch": 0.3322868872253079,
"grad_norm": 0.33612844347953796,
"learning_rate": 6.157824580060756e-05,
"loss": 1.7409,
"step": 1376
},
{
"epoch": 0.3325283747886984,
"grad_norm": 0.33391138911247253,
"learning_rate": 6.155216477745638e-05,
"loss": 1.7668,
"step": 1377
},
{
"epoch": 0.3327698623520889,
"grad_norm": 0.3073732852935791,
"learning_rate": 6.152607083716015e-05,
"loss": 1.7319,
"step": 1378
},
{
"epoch": 0.33301134991547937,
"grad_norm": 0.31348085403442383,
"learning_rate": 6.149996399535806e-05,
"loss": 1.6169,
"step": 1379
},
{
"epoch": 0.33325283747886986,
"grad_norm": 0.32224661111831665,
"learning_rate": 6.147384426769711e-05,
"loss": 1.7747,
"step": 1380
},
{
"epoch": 0.33349432504226034,
"grad_norm": 0.3120202124118805,
"learning_rate": 6.144771166983195e-05,
"loss": 1.87,
"step": 1381
},
{
"epoch": 0.33373581260565083,
"grad_norm": 0.3108193278312683,
"learning_rate": 6.142156621742496e-05,
"loss": 1.7512,
"step": 1382
},
{
"epoch": 0.3339773001690413,
"grad_norm": 0.31070417165756226,
"learning_rate": 6.13954079261463e-05,
"loss": 1.8596,
"step": 1383
},
{
"epoch": 0.3342187877324318,
"grad_norm": 0.3115104138851166,
"learning_rate": 6.136923681167372e-05,
"loss": 1.6334,
"step": 1384
},
{
"epoch": 0.3344602752958223,
"grad_norm": 0.3107805550098419,
"learning_rate": 6.134305288969273e-05,
"loss": 1.7409,
"step": 1385
},
{
"epoch": 0.33470176285921277,
"grad_norm": 0.32333892583847046,
"learning_rate": 6.131685617589646e-05,
"loss": 1.831,
"step": 1386
},
{
"epoch": 0.33494325042260326,
"grad_norm": 0.3145526349544525,
"learning_rate": 6.129064668598574e-05,
"loss": 1.8139,
"step": 1387
},
{
"epoch": 0.33518473798599374,
"grad_norm": 0.3273543119430542,
"learning_rate": 6.12644244356691e-05,
"loss": 1.7674,
"step": 1388
},
{
"epoch": 0.3354262255493842,
"grad_norm": 0.31777769327163696,
"learning_rate": 6.123818944066259e-05,
"loss": 1.7356,
"step": 1389
},
{
"epoch": 0.3356677131127747,
"grad_norm": 0.30964168906211853,
"learning_rate": 6.121194171669003e-05,
"loss": 1.749,
"step": 1390
},
{
"epoch": 0.3359092006761652,
"grad_norm": 0.3542748689651489,
"learning_rate": 6.11856812794828e-05,
"loss": 1.856,
"step": 1391
},
{
"epoch": 0.3361506882395557,
"grad_norm": 0.32668453454971313,
"learning_rate": 6.115940814477994e-05,
"loss": 1.877,
"step": 1392
},
{
"epoch": 0.33639217580294617,
"grad_norm": 0.34220948815345764,
"learning_rate": 6.113312232832804e-05,
"loss": 1.688,
"step": 1393
},
{
"epoch": 0.33663366336633666,
"grad_norm": 0.3136855661869049,
"learning_rate": 6.110682384588133e-05,
"loss": 1.6078,
"step": 1394
},
{
"epoch": 0.33687515092972714,
"grad_norm": 0.337715208530426,
"learning_rate": 6.108051271320167e-05,
"loss": 1.8654,
"step": 1395
},
{
"epoch": 0.3371166384931176,
"grad_norm": 0.30137553811073303,
"learning_rate": 6.105418894605841e-05,
"loss": 1.5995,
"step": 1396
},
{
"epoch": 0.3373581260565081,
"grad_norm": 0.32862013578414917,
"learning_rate": 6.1027852560228555e-05,
"loss": 1.8154,
"step": 1397
},
{
"epoch": 0.3375996136198986,
"grad_norm": 0.3268672227859497,
"learning_rate": 6.1001503571496636e-05,
"loss": 1.7151,
"step": 1398
},
{
"epoch": 0.3378411011832891,
"grad_norm": 0.3278553783893585,
"learning_rate": 6.097514199565473e-05,
"loss": 1.771,
"step": 1399
},
{
"epoch": 0.33808258874667957,
"grad_norm": 0.3203633725643158,
"learning_rate": 6.0948767848502486e-05,
"loss": 1.6725,
"step": 1400
},
{
"epoch": 0.33832407631007005,
"grad_norm": 0.33434566855430603,
"learning_rate": 6.0922381145847065e-05,
"loss": 1.7686,
"step": 1401
},
{
"epoch": 0.33856556387346054,
"grad_norm": 0.3028900921344757,
"learning_rate": 6.089598190350316e-05,
"loss": 1.6449,
"step": 1402
},
{
"epoch": 0.338807051436851,
"grad_norm": 0.32168394327163696,
"learning_rate": 6.086957013729297e-05,
"loss": 1.76,
"step": 1403
},
{
"epoch": 0.3390485390002415,
"grad_norm": 0.3260248601436615,
"learning_rate": 6.084314586304624e-05,
"loss": 1.6925,
"step": 1404
},
{
"epoch": 0.339290026563632,
"grad_norm": 0.3169650137424469,
"learning_rate": 6.081670909660014e-05,
"loss": 1.7216,
"step": 1405
},
{
"epoch": 0.3395315141270225,
"grad_norm": 0.3010064661502838,
"learning_rate": 6.0790259853799386e-05,
"loss": 1.5303,
"step": 1406
},
{
"epoch": 0.33977300169041297,
"grad_norm": 0.32520854473114014,
"learning_rate": 6.076379815049617e-05,
"loss": 1.785,
"step": 1407
},
{
"epoch": 0.34001448925380345,
"grad_norm": 0.32523801922798157,
"learning_rate": 6.0737324002550095e-05,
"loss": 1.6572,
"step": 1408
},
{
"epoch": 0.34025597681719394,
"grad_norm": 0.3176769018173218,
"learning_rate": 6.0710837425828314e-05,
"loss": 1.5568,
"step": 1409
},
{
"epoch": 0.3404974643805844,
"grad_norm": 0.3224984407424927,
"learning_rate": 6.068433843620535e-05,
"loss": 1.6022,
"step": 1410
},
{
"epoch": 0.3407389519439749,
"grad_norm": 0.3200245797634125,
"learning_rate": 6.065782704956319e-05,
"loss": 1.7426,
"step": 1411
},
{
"epoch": 0.3409804395073654,
"grad_norm": 0.3169932961463928,
"learning_rate": 6.063130328179128e-05,
"loss": 1.6143,
"step": 1412
},
{
"epoch": 0.3412219270707559,
"grad_norm": 0.31651175022125244,
"learning_rate": 6.0604767148786436e-05,
"loss": 1.6513,
"step": 1413
},
{
"epoch": 0.34146341463414637,
"grad_norm": 0.3085106313228607,
"learning_rate": 6.0578218666452914e-05,
"loss": 1.759,
"step": 1414
},
{
"epoch": 0.34170490219753685,
"grad_norm": 0.328730046749115,
"learning_rate": 6.055165785070239e-05,
"loss": 1.9085,
"step": 1415
},
{
"epoch": 0.34194638976092734,
"grad_norm": 0.30749958753585815,
"learning_rate": 6.052508471745389e-05,
"loss": 1.644,
"step": 1416
},
{
"epoch": 0.3421878773243178,
"grad_norm": 0.3132942020893097,
"learning_rate": 6.049849928263385e-05,
"loss": 1.7456,
"step": 1417
},
{
"epoch": 0.3424293648877083,
"grad_norm": 0.3153761327266693,
"learning_rate": 6.047190156217607e-05,
"loss": 1.8136,
"step": 1418
},
{
"epoch": 0.3426708524510988,
"grad_norm": 0.2964738607406616,
"learning_rate": 6.0445291572021716e-05,
"loss": 1.657,
"step": 1419
},
{
"epoch": 0.3429123400144893,
"grad_norm": 0.3104841709136963,
"learning_rate": 6.04186693281193e-05,
"loss": 1.7264,
"step": 1420
},
{
"epoch": 0.34315382757787977,
"grad_norm": 0.35105088353157043,
"learning_rate": 6.0392034846424696e-05,
"loss": 1.8898,
"step": 1421
},
{
"epoch": 0.34339531514127025,
"grad_norm": 0.33985963463783264,
"learning_rate": 6.0365388142901096e-05,
"loss": 1.8255,
"step": 1422
},
{
"epoch": 0.34363680270466074,
"grad_norm": 0.3295535743236542,
"learning_rate": 6.0338729233519026e-05,
"loss": 1.6857,
"step": 1423
},
{
"epoch": 0.34387829026805117,
"grad_norm": 0.31867682933807373,
"learning_rate": 6.0312058134256314e-05,
"loss": 1.8694,
"step": 1424
},
{
"epoch": 0.34411977783144165,
"grad_norm": 0.3171629011631012,
"learning_rate": 6.0285374861098125e-05,
"loss": 1.7238,
"step": 1425
},
{
"epoch": 0.34436126539483214,
"grad_norm": 0.3434184193611145,
"learning_rate": 6.025867943003687e-05,
"loss": 1.6924,
"step": 1426
},
{
"epoch": 0.3446027529582226,
"grad_norm": 0.3540340065956116,
"learning_rate": 6.02319718570723e-05,
"loss": 1.8509,
"step": 1427
},
{
"epoch": 0.3448442405216131,
"grad_norm": 0.3207017779350281,
"learning_rate": 6.020525215821142e-05,
"loss": 1.7741,
"step": 1428
},
{
"epoch": 0.3450857280850036,
"grad_norm": 0.31496745347976685,
"learning_rate": 6.0178520349468475e-05,
"loss": 1.7462,
"step": 1429
},
{
"epoch": 0.3453272156483941,
"grad_norm": 0.3210442066192627,
"learning_rate": 6.0151776446865015e-05,
"loss": 1.7108,
"step": 1430
},
{
"epoch": 0.34556870321178457,
"grad_norm": 0.3334159255027771,
"learning_rate": 6.012502046642982e-05,
"loss": 1.6989,
"step": 1431
},
{
"epoch": 0.34581019077517505,
"grad_norm": 0.32715243101119995,
"learning_rate": 6.00982524241989e-05,
"loss": 1.9606,
"step": 1432
},
{
"epoch": 0.34605167833856554,
"grad_norm": 0.3288145065307617,
"learning_rate": 6.007147233621551e-05,
"loss": 1.9522,
"step": 1433
},
{
"epoch": 0.346293165901956,
"grad_norm": 0.30984047055244446,
"learning_rate": 6.004468021853011e-05,
"loss": 1.7703,
"step": 1434
},
{
"epoch": 0.3465346534653465,
"grad_norm": 0.323690265417099,
"learning_rate": 6.001787608720037e-05,
"loss": 1.7608,
"step": 1435
},
{
"epoch": 0.346776141028737,
"grad_norm": 0.33015599846839905,
"learning_rate": 5.9991059958291176e-05,
"loss": 1.8368,
"step": 1436
},
{
"epoch": 0.3470176285921275,
"grad_norm": 0.3160457909107208,
"learning_rate": 5.9964231847874596e-05,
"loss": 1.8098,
"step": 1437
},
{
"epoch": 0.34725911615551797,
"grad_norm": 0.30281051993370056,
"learning_rate": 5.9937391772029855e-05,
"loss": 1.7887,
"step": 1438
},
{
"epoch": 0.34750060371890845,
"grad_norm": 0.315327525138855,
"learning_rate": 5.9910539746843405e-05,
"loss": 1.7365,
"step": 1439
},
{
"epoch": 0.34774209128229894,
"grad_norm": 0.3132166564464569,
"learning_rate": 5.988367578840881e-05,
"loss": 1.6718,
"step": 1440
},
{
"epoch": 0.3479835788456894,
"grad_norm": 0.32553204894065857,
"learning_rate": 5.985679991282679e-05,
"loss": 1.8002,
"step": 1441
},
{
"epoch": 0.3482250664090799,
"grad_norm": 0.3237243592739105,
"learning_rate": 5.9829912136205236e-05,
"loss": 1.8928,
"step": 1442
},
{
"epoch": 0.3484665539724704,
"grad_norm": 0.32126304507255554,
"learning_rate": 5.980301247465917e-05,
"loss": 1.6859,
"step": 1443
},
{
"epoch": 0.3487080415358609,
"grad_norm": 0.3168717920780182,
"learning_rate": 5.977610094431068e-05,
"loss": 1.8302,
"step": 1444
},
{
"epoch": 0.34894952909925137,
"grad_norm": 0.3163128197193146,
"learning_rate": 5.9749177561289063e-05,
"loss": 1.6948,
"step": 1445
},
{
"epoch": 0.34919101666264185,
"grad_norm": 0.3239203989505768,
"learning_rate": 5.9722242341730635e-05,
"loss": 1.7526,
"step": 1446
},
{
"epoch": 0.34943250422603234,
"grad_norm": 0.30871322751045227,
"learning_rate": 5.969529530177884e-05,
"loss": 1.575,
"step": 1447
},
{
"epoch": 0.3496739917894228,
"grad_norm": 0.3129870295524597,
"learning_rate": 5.966833645758422e-05,
"loss": 1.8075,
"step": 1448
},
{
"epoch": 0.3499154793528133,
"grad_norm": 0.3211073875427246,
"learning_rate": 5.9641365825304355e-05,
"loss": 1.763,
"step": 1449
},
{
"epoch": 0.3501569669162038,
"grad_norm": 0.32273295521736145,
"learning_rate": 5.9614383421103944e-05,
"loss": 1.8933,
"step": 1450
},
{
"epoch": 0.3503984544795943,
"grad_norm": 0.31030890345573425,
"learning_rate": 5.9587389261154686e-05,
"loss": 1.6552,
"step": 1451
},
{
"epoch": 0.35063994204298476,
"grad_norm": 0.31312838196754456,
"learning_rate": 5.956038336163534e-05,
"loss": 1.6923,
"step": 1452
},
{
"epoch": 0.35088142960637525,
"grad_norm": 0.3211262822151184,
"learning_rate": 5.9533365738731734e-05,
"loss": 1.7661,
"step": 1453
},
{
"epoch": 0.35112291716976574,
"grad_norm": 0.3056935966014862,
"learning_rate": 5.95063364086367e-05,
"loss": 1.6947,
"step": 1454
},
{
"epoch": 0.3513644047331562,
"grad_norm": 0.3259216547012329,
"learning_rate": 5.947929538755006e-05,
"loss": 1.836,
"step": 1455
},
{
"epoch": 0.3516058922965467,
"grad_norm": 0.3077600300312042,
"learning_rate": 5.94522426916787e-05,
"loss": 1.7187,
"step": 1456
},
{
"epoch": 0.3518473798599372,
"grad_norm": 0.3284499943256378,
"learning_rate": 5.942517833723644e-05,
"loss": 1.8225,
"step": 1457
},
{
"epoch": 0.3520888674233277,
"grad_norm": 0.31834086775779724,
"learning_rate": 5.939810234044413e-05,
"loss": 1.7048,
"step": 1458
},
{
"epoch": 0.35233035498671816,
"grad_norm": 0.3011278510093689,
"learning_rate": 5.937101471752961e-05,
"loss": 1.6252,
"step": 1459
},
{
"epoch": 0.35257184255010865,
"grad_norm": 0.31385111808776855,
"learning_rate": 5.934391548472763e-05,
"loss": 1.6818,
"step": 1460
},
{
"epoch": 0.35281333011349914,
"grad_norm": 0.32963138818740845,
"learning_rate": 5.931680465827995e-05,
"loss": 1.939,
"step": 1461
},
{
"epoch": 0.3530548176768896,
"grad_norm": 0.3022247552871704,
"learning_rate": 5.928968225443526e-05,
"loss": 1.7759,
"step": 1462
},
{
"epoch": 0.3532963052402801,
"grad_norm": 0.30561262369155884,
"learning_rate": 5.9262548289449185e-05,
"loss": 1.6501,
"step": 1463
},
{
"epoch": 0.3535377928036706,
"grad_norm": 0.3121855556964874,
"learning_rate": 5.9235402779584294e-05,
"loss": 1.7566,
"step": 1464
},
{
"epoch": 0.3537792803670611,
"grad_norm": 0.32116931676864624,
"learning_rate": 5.920824574111006e-05,
"loss": 1.726,
"step": 1465
},
{
"epoch": 0.35402076793045156,
"grad_norm": 0.29525837302207947,
"learning_rate": 5.918107719030287e-05,
"loss": 1.6163,
"step": 1466
},
{
"epoch": 0.35426225549384205,
"grad_norm": 0.3194003999233246,
"learning_rate": 5.9153897143446014e-05,
"loss": 1.7976,
"step": 1467
},
{
"epoch": 0.35450374305723253,
"grad_norm": 0.31026211380958557,
"learning_rate": 5.912670561682968e-05,
"loss": 1.7198,
"step": 1468
},
{
"epoch": 0.354745230620623,
"grad_norm": 0.31474968791007996,
"learning_rate": 5.9099502626750914e-05,
"loss": 1.6546,
"step": 1469
},
{
"epoch": 0.3549867181840135,
"grad_norm": 0.3168904185295105,
"learning_rate": 5.907228818951364e-05,
"loss": 1.7855,
"step": 1470
},
{
"epoch": 0.355228205747404,
"grad_norm": 0.33451682329177856,
"learning_rate": 5.9045062321428665e-05,
"loss": 1.7105,
"step": 1471
},
{
"epoch": 0.3554696933107945,
"grad_norm": 0.3296138048171997,
"learning_rate": 5.901782503881363e-05,
"loss": 1.847,
"step": 1472
},
{
"epoch": 0.35571118087418496,
"grad_norm": 0.29878273606300354,
"learning_rate": 5.899057635799299e-05,
"loss": 1.6533,
"step": 1473
},
{
"epoch": 0.35595266843757545,
"grad_norm": 0.33155831694602966,
"learning_rate": 5.896331629529809e-05,
"loss": 1.9612,
"step": 1474
},
{
"epoch": 0.35619415600096593,
"grad_norm": 0.3336942493915558,
"learning_rate": 5.893604486706705e-05,
"loss": 1.8694,
"step": 1475
},
{
"epoch": 0.3564356435643564,
"grad_norm": 0.32858115434646606,
"learning_rate": 5.890876208964482e-05,
"loss": 1.8195,
"step": 1476
},
{
"epoch": 0.3566771311277469,
"grad_norm": 0.3218596577644348,
"learning_rate": 5.888146797938316e-05,
"loss": 1.8801,
"step": 1477
},
{
"epoch": 0.3569186186911374,
"grad_norm": 0.31268176436424255,
"learning_rate": 5.885416255264059e-05,
"loss": 1.6911,
"step": 1478
},
{
"epoch": 0.3571601062545279,
"grad_norm": 0.32213079929351807,
"learning_rate": 5.882684582578244e-05,
"loss": 1.8121,
"step": 1479
},
{
"epoch": 0.35740159381791836,
"grad_norm": 0.32161325216293335,
"learning_rate": 5.879951781518083e-05,
"loss": 1.7864,
"step": 1480
},
{
"epoch": 0.35764308138130885,
"grad_norm": 0.32209402322769165,
"learning_rate": 5.8772178537214586e-05,
"loss": 1.8956,
"step": 1481
},
{
"epoch": 0.35788456894469933,
"grad_norm": 0.3203023374080658,
"learning_rate": 5.8744828008269336e-05,
"loss": 1.7863,
"step": 1482
},
{
"epoch": 0.3581260565080898,
"grad_norm": 0.3110294044017792,
"learning_rate": 5.871746624473744e-05,
"loss": 1.7236,
"step": 1483
},
{
"epoch": 0.3583675440714803,
"grad_norm": 0.3175148665904999,
"learning_rate": 5.8690093263017984e-05,
"loss": 1.8843,
"step": 1484
},
{
"epoch": 0.3586090316348708,
"grad_norm": 0.31239208579063416,
"learning_rate": 5.866270907951678e-05,
"loss": 1.7412,
"step": 1485
},
{
"epoch": 0.3588505191982613,
"grad_norm": 0.30664995312690735,
"learning_rate": 5.863531371064634e-05,
"loss": 1.7344,
"step": 1486
},
{
"epoch": 0.35909200676165176,
"grad_norm": 0.3216778635978699,
"learning_rate": 5.8607907172825923e-05,
"loss": 1.8317,
"step": 1487
},
{
"epoch": 0.35933349432504225,
"grad_norm": 0.3176087737083435,
"learning_rate": 5.858048948248143e-05,
"loss": 1.778,
"step": 1488
},
{
"epoch": 0.35957498188843273,
"grad_norm": 0.31520044803619385,
"learning_rate": 5.855306065604548e-05,
"loss": 1.6223,
"step": 1489
},
{
"epoch": 0.3598164694518232,
"grad_norm": 0.33666151762008667,
"learning_rate": 5.852562070995735e-05,
"loss": 1.8668,
"step": 1490
},
{
"epoch": 0.3600579570152137,
"grad_norm": 0.3103683292865753,
"learning_rate": 5.849816966066298e-05,
"loss": 1.8146,
"step": 1491
},
{
"epoch": 0.3602994445786042,
"grad_norm": 0.32813334465026855,
"learning_rate": 5.8470707524615e-05,
"loss": 1.9376,
"step": 1492
},
{
"epoch": 0.3605409321419947,
"grad_norm": 0.33966293931007385,
"learning_rate": 5.844323431827263e-05,
"loss": 2.0089,
"step": 1493
},
{
"epoch": 0.36078241970538516,
"grad_norm": 0.29662173986434937,
"learning_rate": 5.8415750058101765e-05,
"loss": 1.6096,
"step": 1494
},
{
"epoch": 0.36102390726877565,
"grad_norm": 0.3011605441570282,
"learning_rate": 5.83882547605749e-05,
"loss": 1.6289,
"step": 1495
},
{
"epoch": 0.36126539483216613,
"grad_norm": 0.3044760227203369,
"learning_rate": 5.8360748442171164e-05,
"loss": 1.7737,
"step": 1496
},
{
"epoch": 0.3615068823955566,
"grad_norm": 0.31246650218963623,
"learning_rate": 5.833323111937629e-05,
"loss": 1.7599,
"step": 1497
},
{
"epoch": 0.3617483699589471,
"grad_norm": 0.30395039916038513,
"learning_rate": 5.830570280868258e-05,
"loss": 1.6438,
"step": 1498
},
{
"epoch": 0.3619898575223376,
"grad_norm": 0.3342861831188202,
"learning_rate": 5.827816352658896e-05,
"loss": 1.9117,
"step": 1499
},
{
"epoch": 0.3622313450857281,
"grad_norm": 0.3126901090145111,
"learning_rate": 5.825061328960091e-05,
"loss": 1.8322,
"step": 1500
},
{
"epoch": 0.36247283264911856,
"grad_norm": 0.325332909822464,
"learning_rate": 5.822305211423049e-05,
"loss": 1.8047,
"step": 1501
},
{
"epoch": 0.36271432021250904,
"grad_norm": 0.3215937614440918,
"learning_rate": 5.819548001699628e-05,
"loss": 1.8229,
"step": 1502
},
{
"epoch": 0.36295580777589953,
"grad_norm": 0.32431450486183167,
"learning_rate": 5.816789701442345e-05,
"loss": 1.7385,
"step": 1503
},
{
"epoch": 0.36319729533929,
"grad_norm": 0.3194507956504822,
"learning_rate": 5.8140303123043676e-05,
"loss": 1.6355,
"step": 1504
},
{
"epoch": 0.3634387829026805,
"grad_norm": 0.31866469979286194,
"learning_rate": 5.811269835939518e-05,
"loss": 1.7696,
"step": 1505
},
{
"epoch": 0.363680270466071,
"grad_norm": 0.30973389744758606,
"learning_rate": 5.808508274002269e-05,
"loss": 1.6875,
"step": 1506
},
{
"epoch": 0.3639217580294615,
"grad_norm": 0.31541547179222107,
"learning_rate": 5.805745628147744e-05,
"loss": 1.6931,
"step": 1507
},
{
"epoch": 0.36416324559285196,
"grad_norm": 0.31543099880218506,
"learning_rate": 5.802981900031716e-05,
"loss": 1.7594,
"step": 1508
},
{
"epoch": 0.36440473315624244,
"grad_norm": 0.3169846832752228,
"learning_rate": 5.8002170913106074e-05,
"loss": 1.8439,
"step": 1509
},
{
"epoch": 0.36464622071963293,
"grad_norm": 0.31679767370224,
"learning_rate": 5.797451203641488e-05,
"loss": 1.7327,
"step": 1510
},
{
"epoch": 0.3648877082830234,
"grad_norm": 0.30597200989723206,
"learning_rate": 5.794684238682072e-05,
"loss": 1.68,
"step": 1511
},
{
"epoch": 0.3651291958464139,
"grad_norm": 0.36071524024009705,
"learning_rate": 5.7919161980907236e-05,
"loss": 1.9643,
"step": 1512
},
{
"epoch": 0.3653706834098044,
"grad_norm": 0.306130975484848,
"learning_rate": 5.789147083526449e-05,
"loss": 1.5648,
"step": 1513
},
{
"epoch": 0.36561217097319487,
"grad_norm": 0.3169862926006317,
"learning_rate": 5.7863768966488966e-05,
"loss": 1.7462,
"step": 1514
},
{
"epoch": 0.36585365853658536,
"grad_norm": 0.31784337759017944,
"learning_rate": 5.783605639118362e-05,
"loss": 1.749,
"step": 1515
},
{
"epoch": 0.36609514609997584,
"grad_norm": 0.33231326937675476,
"learning_rate": 5.780833312595777e-05,
"loss": 1.6741,
"step": 1516
},
{
"epoch": 0.36633663366336633,
"grad_norm": 0.3198108673095703,
"learning_rate": 5.7780599187427186e-05,
"loss": 1.8175,
"step": 1517
},
{
"epoch": 0.3665781212267568,
"grad_norm": 0.32270848751068115,
"learning_rate": 5.775285459221401e-05,
"loss": 1.8811,
"step": 1518
},
{
"epoch": 0.3668196087901473,
"grad_norm": 0.328346848487854,
"learning_rate": 5.772509935694678e-05,
"loss": 1.6793,
"step": 1519
},
{
"epoch": 0.3670610963535378,
"grad_norm": 0.29311639070510864,
"learning_rate": 5.7697333498260414e-05,
"loss": 1.6357,
"step": 1520
},
{
"epoch": 0.36730258391692827,
"grad_norm": 0.3062235414981842,
"learning_rate": 5.7669557032796184e-05,
"loss": 1.6904,
"step": 1521
},
{
"epoch": 0.36754407148031876,
"grad_norm": 0.3087918162345886,
"learning_rate": 5.764176997720175e-05,
"loss": 1.7203,
"step": 1522
},
{
"epoch": 0.36778555904370924,
"grad_norm": 0.2941713333129883,
"learning_rate": 5.761397234813106e-05,
"loss": 1.5707,
"step": 1523
},
{
"epoch": 0.36802704660709973,
"grad_norm": 0.3183874487876892,
"learning_rate": 5.7586164162244474e-05,
"loss": 1.7364,
"step": 1524
},
{
"epoch": 0.3682685341704902,
"grad_norm": 0.2950633466243744,
"learning_rate": 5.7558345436208616e-05,
"loss": 1.5168,
"step": 1525
},
{
"epoch": 0.3685100217338807,
"grad_norm": 0.3116483986377716,
"learning_rate": 5.753051618669646e-05,
"loss": 1.8518,
"step": 1526
},
{
"epoch": 0.3687515092972712,
"grad_norm": 0.3113264739513397,
"learning_rate": 5.7502676430387275e-05,
"loss": 1.7688,
"step": 1527
},
{
"epoch": 0.36899299686066167,
"grad_norm": 0.3159504532814026,
"learning_rate": 5.747482618396666e-05,
"loss": 1.7198,
"step": 1528
},
{
"epoch": 0.36923448442405216,
"grad_norm": 0.34909993410110474,
"learning_rate": 5.744696546412642e-05,
"loss": 1.8096,
"step": 1529
},
{
"epoch": 0.36947597198744264,
"grad_norm": 0.31155431270599365,
"learning_rate": 5.741909428756473e-05,
"loss": 1.7383,
"step": 1530
},
{
"epoch": 0.3697174595508331,
"grad_norm": 0.3173414468765259,
"learning_rate": 5.7391212670985985e-05,
"loss": 1.8322,
"step": 1531
},
{
"epoch": 0.3699589471142236,
"grad_norm": 0.3144669830799103,
"learning_rate": 5.736332063110084e-05,
"loss": 1.7064,
"step": 1532
},
{
"epoch": 0.3702004346776141,
"grad_norm": 0.31089121103286743,
"learning_rate": 5.733541818462621e-05,
"loss": 1.6687,
"step": 1533
},
{
"epoch": 0.3704419222410046,
"grad_norm": 0.3142034411430359,
"learning_rate": 5.7307505348285216e-05,
"loss": 1.6096,
"step": 1534
},
{
"epoch": 0.37068340980439507,
"grad_norm": 0.3087711036205292,
"learning_rate": 5.7279582138807264e-05,
"loss": 1.6961,
"step": 1535
},
{
"epoch": 0.37092489736778556,
"grad_norm": 0.2999480664730072,
"learning_rate": 5.725164857292791e-05,
"loss": 1.6879,
"step": 1536
},
{
"epoch": 0.37116638493117604,
"grad_norm": 0.32123640179634094,
"learning_rate": 5.7223704667388965e-05,
"loss": 1.8965,
"step": 1537
},
{
"epoch": 0.3714078724945665,
"grad_norm": 0.29996874928474426,
"learning_rate": 5.719575043893842e-05,
"loss": 1.6886,
"step": 1538
},
{
"epoch": 0.371649360057957,
"grad_norm": 0.32067954540252686,
"learning_rate": 5.716778590433045e-05,
"loss": 1.7657,
"step": 1539
},
{
"epoch": 0.3718908476213475,
"grad_norm": 0.3116958737373352,
"learning_rate": 5.713981108032542e-05,
"loss": 1.7947,
"step": 1540
},
{
"epoch": 0.372132335184738,
"grad_norm": 0.30600202083587646,
"learning_rate": 5.711182598368983e-05,
"loss": 1.7915,
"step": 1541
},
{
"epoch": 0.37237382274812847,
"grad_norm": 0.3116418421268463,
"learning_rate": 5.7083830631196375e-05,
"loss": 1.6921,
"step": 1542
},
{
"epoch": 0.37261531031151895,
"grad_norm": 0.31770211458206177,
"learning_rate": 5.705582503962388e-05,
"loss": 1.6573,
"step": 1543
},
{
"epoch": 0.37285679787490944,
"grad_norm": 0.31877562403678894,
"learning_rate": 5.702780922575733e-05,
"loss": 1.8058,
"step": 1544
},
{
"epoch": 0.3730982854382999,
"grad_norm": 0.32386425137519836,
"learning_rate": 5.699978320638777e-05,
"loss": 1.8911,
"step": 1545
},
{
"epoch": 0.3733397730016904,
"grad_norm": 0.31875795125961304,
"learning_rate": 5.697174699831244e-05,
"loss": 1.6746,
"step": 1546
},
{
"epoch": 0.3735812605650809,
"grad_norm": 0.30674871802330017,
"learning_rate": 5.694370061833464e-05,
"loss": 1.6765,
"step": 1547
},
{
"epoch": 0.3738227481284714,
"grad_norm": 0.3357049822807312,
"learning_rate": 5.691564408326379e-05,
"loss": 1.7836,
"step": 1548
},
{
"epoch": 0.37406423569186187,
"grad_norm": 0.318651020526886,
"learning_rate": 5.688757740991537e-05,
"loss": 1.6588,
"step": 1549
},
{
"epoch": 0.37430572325525235,
"grad_norm": 0.3196345567703247,
"learning_rate": 5.6859500615110956e-05,
"loss": 1.7283,
"step": 1550
},
{
"epoch": 0.37454721081864284,
"grad_norm": 0.35835352540016174,
"learning_rate": 5.6831413715678197e-05,
"loss": 1.8224,
"step": 1551
},
{
"epoch": 0.3747886983820333,
"grad_norm": 0.30183541774749756,
"learning_rate": 5.680331672845078e-05,
"loss": 1.6194,
"step": 1552
},
{
"epoch": 0.3750301859454238,
"grad_norm": 0.318406343460083,
"learning_rate": 5.6775209670268436e-05,
"loss": 1.7579,
"step": 1553
},
{
"epoch": 0.3752716735088143,
"grad_norm": 0.3073185980319977,
"learning_rate": 5.6747092557976966e-05,
"loss": 1.6283,
"step": 1554
},
{
"epoch": 0.3755131610722048,
"grad_norm": 0.3035070598125458,
"learning_rate": 5.671896540842815e-05,
"loss": 1.7404,
"step": 1555
},
{
"epoch": 0.37575464863559527,
"grad_norm": 0.3201872706413269,
"learning_rate": 5.66908282384798e-05,
"loss": 1.8483,
"step": 1556
},
{
"epoch": 0.37599613619898575,
"grad_norm": 0.3132006525993347,
"learning_rate": 5.6662681064995776e-05,
"loss": 1.696,
"step": 1557
},
{
"epoch": 0.37623762376237624,
"grad_norm": 0.31123900413513184,
"learning_rate": 5.663452390484586e-05,
"loss": 1.6547,
"step": 1558
},
{
"epoch": 0.3764791113257667,
"grad_norm": 0.3195400834083557,
"learning_rate": 5.660635677490587e-05,
"loss": 1.7253,
"step": 1559
},
{
"epoch": 0.3767205988891572,
"grad_norm": 0.30685698986053467,
"learning_rate": 5.657817969205759e-05,
"loss": 1.6781,
"step": 1560
},
{
"epoch": 0.3769620864525477,
"grad_norm": 0.3170833885669708,
"learning_rate": 5.654999267318877e-05,
"loss": 1.7026,
"step": 1561
},
{
"epoch": 0.3772035740159382,
"grad_norm": 0.3388971984386444,
"learning_rate": 5.652179573519309e-05,
"loss": 1.763,
"step": 1562
},
{
"epoch": 0.37744506157932867,
"grad_norm": 0.33410897850990295,
"learning_rate": 5.6493588894970205e-05,
"loss": 1.7048,
"step": 1563
},
{
"epoch": 0.37768654914271915,
"grad_norm": 0.3090329170227051,
"learning_rate": 5.646537216942571e-05,
"loss": 1.6734,
"step": 1564
},
{
"epoch": 0.37792803670610964,
"grad_norm": 0.3300420343875885,
"learning_rate": 5.6437145575471086e-05,
"loss": 1.8244,
"step": 1565
},
{
"epoch": 0.3781695242695001,
"grad_norm": 0.34012481570243835,
"learning_rate": 5.640890913002377e-05,
"loss": 1.827,
"step": 1566
},
{
"epoch": 0.3784110118328906,
"grad_norm": 0.31980302929878235,
"learning_rate": 5.638066285000708e-05,
"loss": 1.685,
"step": 1567
},
{
"epoch": 0.3786524993962811,
"grad_norm": 0.32107704877853394,
"learning_rate": 5.6352406752350225e-05,
"loss": 1.8356,
"step": 1568
},
{
"epoch": 0.3788939869596716,
"grad_norm": 0.3096439838409424,
"learning_rate": 5.632414085398832e-05,
"loss": 1.5647,
"step": 1569
},
{
"epoch": 0.37913547452306207,
"grad_norm": 0.3330332934856415,
"learning_rate": 5.6295865171862357e-05,
"loss": 1.7864,
"step": 1570
},
{
"epoch": 0.37937696208645255,
"grad_norm": 0.34433725476264954,
"learning_rate": 5.6267579722919126e-05,
"loss": 1.8842,
"step": 1571
},
{
"epoch": 0.37961844964984304,
"grad_norm": 0.3113875091075897,
"learning_rate": 5.623928452411136e-05,
"loss": 1.8177,
"step": 1572
},
{
"epoch": 0.3798599372132335,
"grad_norm": 0.32041990756988525,
"learning_rate": 5.621097959239759e-05,
"loss": 1.7717,
"step": 1573
},
{
"epoch": 0.380101424776624,
"grad_norm": 0.3251771628856659,
"learning_rate": 5.618266494474218e-05,
"loss": 1.7525,
"step": 1574
},
{
"epoch": 0.3803429123400145,
"grad_norm": 0.3050212860107422,
"learning_rate": 5.6154340598115316e-05,
"loss": 1.6738,
"step": 1575
},
{
"epoch": 0.380584399903405,
"grad_norm": 0.3157691955566406,
"learning_rate": 5.612600656949302e-05,
"loss": 1.748,
"step": 1576
},
{
"epoch": 0.38082588746679547,
"grad_norm": 0.340025931596756,
"learning_rate": 5.609766287585711e-05,
"loss": 1.8142,
"step": 1577
},
{
"epoch": 0.38106737503018595,
"grad_norm": 0.3133496046066284,
"learning_rate": 5.606930953419517e-05,
"loss": 1.6432,
"step": 1578
},
{
"epoch": 0.38130886259357644,
"grad_norm": 0.3089030683040619,
"learning_rate": 5.6040946561500594e-05,
"loss": 1.6171,
"step": 1579
},
{
"epoch": 0.3815503501569669,
"grad_norm": 0.31117933988571167,
"learning_rate": 5.601257397477252e-05,
"loss": 1.7193,
"step": 1580
},
{
"epoch": 0.3817918377203574,
"grad_norm": 0.31880703568458557,
"learning_rate": 5.59841917910159e-05,
"loss": 1.7253,
"step": 1581
},
{
"epoch": 0.3820333252837479,
"grad_norm": 0.3133091330528259,
"learning_rate": 5.595580002724137e-05,
"loss": 1.522,
"step": 1582
},
{
"epoch": 0.3822748128471384,
"grad_norm": 0.34288087487220764,
"learning_rate": 5.592739870046537e-05,
"loss": 1.8463,
"step": 1583
},
{
"epoch": 0.38251630041052886,
"grad_norm": 0.3295765817165375,
"learning_rate": 5.589898782771004e-05,
"loss": 1.8319,
"step": 1584
},
{
"epoch": 0.38275778797391935,
"grad_norm": 0.3100754916667938,
"learning_rate": 5.587056742600322e-05,
"loss": 1.6536,
"step": 1585
},
{
"epoch": 0.38299927553730984,
"grad_norm": 0.3132288157939911,
"learning_rate": 5.5842137512378524e-05,
"loss": 1.6085,
"step": 1586
},
{
"epoch": 0.3832407631007003,
"grad_norm": 0.3163909912109375,
"learning_rate": 5.5813698103875206e-05,
"loss": 1.5761,
"step": 1587
},
{
"epoch": 0.3834822506640908,
"grad_norm": 0.3432241976261139,
"learning_rate": 5.578524921753824e-05,
"loss": 1.6101,
"step": 1588
},
{
"epoch": 0.3837237382274813,
"grad_norm": 0.307777464389801,
"learning_rate": 5.5756790870418274e-05,
"loss": 1.7152,
"step": 1589
},
{
"epoch": 0.3839652257908718,
"grad_norm": 0.31681734323501587,
"learning_rate": 5.572832307957163e-05,
"loss": 1.7113,
"step": 1590
},
{
"epoch": 0.38420671335426226,
"grad_norm": 0.33259811997413635,
"learning_rate": 5.569984586206028e-05,
"loss": 1.6767,
"step": 1591
},
{
"epoch": 0.38444820091765275,
"grad_norm": 0.32139548659324646,
"learning_rate": 5.567135923495187e-05,
"loss": 1.8471,
"step": 1592
},
{
"epoch": 0.38468968848104323,
"grad_norm": 0.3762575089931488,
"learning_rate": 5.564286321531965e-05,
"loss": 1.8994,
"step": 1593
},
{
"epoch": 0.3849311760444337,
"grad_norm": 0.32005301117897034,
"learning_rate": 5.5614357820242525e-05,
"loss": 1.6572,
"step": 1594
},
{
"epoch": 0.3851726636078242,
"grad_norm": 0.3230658769607544,
"learning_rate": 5.558584306680501e-05,
"loss": 1.7142,
"step": 1595
},
{
"epoch": 0.3854141511712147,
"grad_norm": 0.31494832038879395,
"learning_rate": 5.5557318972097226e-05,
"loss": 1.7121,
"step": 1596
},
{
"epoch": 0.3856556387346052,
"grad_norm": 0.31691285967826843,
"learning_rate": 5.552878555321491e-05,
"loss": 1.707,
"step": 1597
},
{
"epoch": 0.38589712629799566,
"grad_norm": 0.3042242228984833,
"learning_rate": 5.550024282725936e-05,
"loss": 1.6972,
"step": 1598
},
{
"epoch": 0.38613861386138615,
"grad_norm": 0.30469492077827454,
"learning_rate": 5.5471690811337494e-05,
"loss": 1.6826,
"step": 1599
},
{
"epoch": 0.38638010142477663,
"grad_norm": 0.31376826763153076,
"learning_rate": 5.5443129522561734e-05,
"loss": 1.6751,
"step": 1600
},
{
"epoch": 0.3866215889881671,
"grad_norm": 0.31754270195961,
"learning_rate": 5.541455897805012e-05,
"loss": 1.737,
"step": 1601
},
{
"epoch": 0.3868630765515576,
"grad_norm": 0.3262483775615692,
"learning_rate": 5.538597919492621e-05,
"loss": 1.5888,
"step": 1602
},
{
"epoch": 0.3871045641149481,
"grad_norm": 0.32402339577674866,
"learning_rate": 5.53573901903191e-05,
"loss": 1.7864,
"step": 1603
},
{
"epoch": 0.3873460516783386,
"grad_norm": 0.321544349193573,
"learning_rate": 5.5328791981363435e-05,
"loss": 1.7058,
"step": 1604
},
{
"epoch": 0.38758753924172906,
"grad_norm": 0.31502535939216614,
"learning_rate": 5.530018458519935e-05,
"loss": 1.7887,
"step": 1605
},
{
"epoch": 0.38782902680511955,
"grad_norm": 0.30999353528022766,
"learning_rate": 5.5271568018972474e-05,
"loss": 1.7674,
"step": 1606
},
{
"epoch": 0.38807051436851003,
"grad_norm": 0.31182703375816345,
"learning_rate": 5.5242942299833984e-05,
"loss": 1.7194,
"step": 1607
},
{
"epoch": 0.3883120019319005,
"grad_norm": 0.31964096426963806,
"learning_rate": 5.5214307444940495e-05,
"loss": 1.6184,
"step": 1608
},
{
"epoch": 0.388553489495291,
"grad_norm": 0.3312462866306305,
"learning_rate": 5.5185663471454115e-05,
"loss": 1.7521,
"step": 1609
},
{
"epoch": 0.3887949770586815,
"grad_norm": 0.3217445909976959,
"learning_rate": 5.515701039654243e-05,
"loss": 1.7388,
"step": 1610
},
{
"epoch": 0.389036464622072,
"grad_norm": 0.3201799690723419,
"learning_rate": 5.512834823737846e-05,
"loss": 1.7771,
"step": 1611
},
{
"epoch": 0.38927795218546246,
"grad_norm": 0.3134850561618805,
"learning_rate": 5.509967701114068e-05,
"loss": 1.7415,
"step": 1612
},
{
"epoch": 0.38951943974885295,
"grad_norm": 0.3229968845844269,
"learning_rate": 5.5070996735013e-05,
"loss": 1.8011,
"step": 1613
},
{
"epoch": 0.38976092731224343,
"grad_norm": 0.3218373656272888,
"learning_rate": 5.5042307426184735e-05,
"loss": 1.7577,
"step": 1614
},
{
"epoch": 0.3900024148756339,
"grad_norm": 0.3155001997947693,
"learning_rate": 5.501360910185063e-05,
"loss": 1.7679,
"step": 1615
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.3090244233608246,
"learning_rate": 5.4984901779210855e-05,
"loss": 1.6268,
"step": 1616
},
{
"epoch": 0.3904853900024149,
"grad_norm": 0.3192291557788849,
"learning_rate": 5.495618547547094e-05,
"loss": 1.8372,
"step": 1617
},
{
"epoch": 0.3907268775658054,
"grad_norm": 0.31249117851257324,
"learning_rate": 5.4927460207841796e-05,
"loss": 1.8075,
"step": 1618
},
{
"epoch": 0.39096836512919586,
"grad_norm": 0.3381814658641815,
"learning_rate": 5.4898725993539735e-05,
"loss": 1.9058,
"step": 1619
},
{
"epoch": 0.39120985269258635,
"grad_norm": 0.31739556789398193,
"learning_rate": 5.48699828497864e-05,
"loss": 1.8154,
"step": 1620
},
{
"epoch": 0.39145134025597683,
"grad_norm": 0.3291226029396057,
"learning_rate": 5.484123079380882e-05,
"loss": 1.7774,
"step": 1621
},
{
"epoch": 0.3916928278193673,
"grad_norm": 0.30211007595062256,
"learning_rate": 5.4812469842839334e-05,
"loss": 1.6932,
"step": 1622
},
{
"epoch": 0.3919343153827578,
"grad_norm": 0.3263416886329651,
"learning_rate": 5.478370001411564e-05,
"loss": 1.7078,
"step": 1623
},
{
"epoch": 0.3921758029461483,
"grad_norm": 0.3306402266025543,
"learning_rate": 5.475492132488072e-05,
"loss": 1.8144,
"step": 1624
},
{
"epoch": 0.3924172905095388,
"grad_norm": 0.31025224924087524,
"learning_rate": 5.472613379238289e-05,
"loss": 1.7594,
"step": 1625
},
{
"epoch": 0.39265877807292926,
"grad_norm": 0.31240203976631165,
"learning_rate": 5.4697337433875785e-05,
"loss": 1.7538,
"step": 1626
},
{
"epoch": 0.39290026563631975,
"grad_norm": 0.32786843180656433,
"learning_rate": 5.466853226661828e-05,
"loss": 1.7343,
"step": 1627
},
{
"epoch": 0.39314175319971023,
"grad_norm": 0.31915387511253357,
"learning_rate": 5.4639718307874576e-05,
"loss": 1.6627,
"step": 1628
},
{
"epoch": 0.3933832407631007,
"grad_norm": 0.3256676495075226,
"learning_rate": 5.461089557491413e-05,
"loss": 1.6906,
"step": 1629
},
{
"epoch": 0.3936247283264912,
"grad_norm": 0.33956941962242126,
"learning_rate": 5.4582064085011644e-05,
"loss": 1.7723,
"step": 1630
},
{
"epoch": 0.3938662158898817,
"grad_norm": 0.32009264826774597,
"learning_rate": 5.455322385544707e-05,
"loss": 1.6601,
"step": 1631
},
{
"epoch": 0.3941077034532722,
"grad_norm": 0.3323977291584015,
"learning_rate": 5.452437490350562e-05,
"loss": 1.8277,
"step": 1632
},
{
"epoch": 0.39434919101666266,
"grad_norm": 0.3169059753417969,
"learning_rate": 5.449551724647772e-05,
"loss": 1.7505,
"step": 1633
},
{
"epoch": 0.39459067858005314,
"grad_norm": 0.3227306306362152,
"learning_rate": 5.446665090165901e-05,
"loss": 1.9677,
"step": 1634
},
{
"epoch": 0.39483216614344363,
"grad_norm": 0.32162293791770935,
"learning_rate": 5.4437775886350334e-05,
"loss": 1.7486,
"step": 1635
},
{
"epoch": 0.3950736537068341,
"grad_norm": 0.3121008574962616,
"learning_rate": 5.440889221785773e-05,
"loss": 1.6298,
"step": 1636
},
{
"epoch": 0.3953151412702246,
"grad_norm": 0.3139210343360901,
"learning_rate": 5.437999991349246e-05,
"loss": 1.7676,
"step": 1637
},
{
"epoch": 0.3955566288336151,
"grad_norm": 0.30618348717689514,
"learning_rate": 5.43510989905709e-05,
"loss": 1.7309,
"step": 1638
},
{
"epoch": 0.3957981163970056,
"grad_norm": 0.325777143239975,
"learning_rate": 5.432218946641465e-05,
"loss": 1.6668,
"step": 1639
},
{
"epoch": 0.39603960396039606,
"grad_norm": 0.3241610527038574,
"learning_rate": 5.429327135835042e-05,
"loss": 1.6995,
"step": 1640
},
{
"epoch": 0.39628109152378654,
"grad_norm": 0.3215353786945343,
"learning_rate": 5.4264344683710096e-05,
"loss": 1.8294,
"step": 1641
},
{
"epoch": 0.39652257908717703,
"grad_norm": 0.3343597650527954,
"learning_rate": 5.4235409459830664e-05,
"loss": 1.7734,
"step": 1642
},
{
"epoch": 0.3967640666505675,
"grad_norm": 0.3067845404148102,
"learning_rate": 5.4206465704054295e-05,
"loss": 1.5428,
"step": 1643
},
{
"epoch": 0.397005554213958,
"grad_norm": 0.31020960211753845,
"learning_rate": 5.41775134337282e-05,
"loss": 1.7374,
"step": 1644
},
{
"epoch": 0.3972470417773485,
"grad_norm": 0.3085239827632904,
"learning_rate": 5.414855266620475e-05,
"loss": 1.5923,
"step": 1645
},
{
"epoch": 0.39748852934073897,
"grad_norm": 0.30102473497390747,
"learning_rate": 5.411958341884137e-05,
"loss": 1.6841,
"step": 1646
},
{
"epoch": 0.39773001690412946,
"grad_norm": 0.32308852672576904,
"learning_rate": 5.4090605709000574e-05,
"loss": 1.8351,
"step": 1647
},
{
"epoch": 0.39797150446751994,
"grad_norm": 0.34821414947509766,
"learning_rate": 5.406161955405e-05,
"loss": 1.809,
"step": 1648
},
{
"epoch": 0.39821299203091043,
"grad_norm": 0.36567896604537964,
"learning_rate": 5.403262497136227e-05,
"loss": 1.7037,
"step": 1649
},
{
"epoch": 0.3984544795943009,
"grad_norm": 0.3330789804458618,
"learning_rate": 5.4003621978315095e-05,
"loss": 1.7455,
"step": 1650
},
{
"epoch": 0.3986959671576914,
"grad_norm": 0.3309069871902466,
"learning_rate": 5.3974610592291235e-05,
"loss": 1.9542,
"step": 1651
},
{
"epoch": 0.3989374547210819,
"grad_norm": 0.3199659287929535,
"learning_rate": 5.394559083067845e-05,
"loss": 1.683,
"step": 1652
},
{
"epoch": 0.39917894228447237,
"grad_norm": 0.3193099796772003,
"learning_rate": 5.3916562710869556e-05,
"loss": 1.6782,
"step": 1653
},
{
"epoch": 0.39942042984786286,
"grad_norm": 0.33188971877098083,
"learning_rate": 5.388752625026237e-05,
"loss": 1.6784,
"step": 1654
},
{
"epoch": 0.39966191741125334,
"grad_norm": 0.3204587399959564,
"learning_rate": 5.385848146625969e-05,
"loss": 1.7851,
"step": 1655
},
{
"epoch": 0.39990340497464383,
"grad_norm": 0.33102720975875854,
"learning_rate": 5.38294283762693e-05,
"loss": 1.783,
"step": 1656
},
{
"epoch": 0.4001448925380343,
"grad_norm": 0.33277207612991333,
"learning_rate": 5.380036699770399e-05,
"loss": 1.858,
"step": 1657
},
{
"epoch": 0.4003863801014248,
"grad_norm": 0.3018147945404053,
"learning_rate": 5.377129734798149e-05,
"loss": 1.6409,
"step": 1658
},
{
"epoch": 0.4006278676648153,
"grad_norm": 0.339412122964859,
"learning_rate": 5.3742219444524504e-05,
"loss": 1.7925,
"step": 1659
},
{
"epoch": 0.40086935522820577,
"grad_norm": 0.32353413105010986,
"learning_rate": 5.371313330476068e-05,
"loss": 1.6374,
"step": 1660
},
{
"epoch": 0.40111084279159626,
"grad_norm": 0.31544435024261475,
"learning_rate": 5.368403894612261e-05,
"loss": 1.7994,
"step": 1661
},
{
"epoch": 0.40135233035498674,
"grad_norm": 0.3048715889453888,
"learning_rate": 5.365493638604777e-05,
"loss": 1.7828,
"step": 1662
},
{
"epoch": 0.4015938179183772,
"grad_norm": 0.33015862107276917,
"learning_rate": 5.362582564197863e-05,
"loss": 1.7849,
"step": 1663
},
{
"epoch": 0.4018353054817677,
"grad_norm": 0.3231745660305023,
"learning_rate": 5.359670673136247e-05,
"loss": 1.5934,
"step": 1664
},
{
"epoch": 0.4020767930451582,
"grad_norm": 0.35362470149993896,
"learning_rate": 5.3567579671651544e-05,
"loss": 1.7968,
"step": 1665
},
{
"epoch": 0.4023182806085487,
"grad_norm": 0.3389405608177185,
"learning_rate": 5.353844448030297e-05,
"loss": 1.7623,
"step": 1666
},
{
"epoch": 0.40255976817193917,
"grad_norm": 0.32034578919410706,
"learning_rate": 5.35093011747787e-05,
"loss": 1.732,
"step": 1667
},
{
"epoch": 0.40280125573532966,
"grad_norm": 0.33826392889022827,
"learning_rate": 5.348014977254558e-05,
"loss": 1.8616,
"step": 1668
},
{
"epoch": 0.40304274329872014,
"grad_norm": 0.33229494094848633,
"learning_rate": 5.345099029107533e-05,
"loss": 1.8809,
"step": 1669
},
{
"epoch": 0.4032842308621106,
"grad_norm": 0.3166428506374359,
"learning_rate": 5.342182274784447e-05,
"loss": 1.7468,
"step": 1670
},
{
"epoch": 0.4035257184255011,
"grad_norm": 0.3228038251399994,
"learning_rate": 5.339264716033438e-05,
"loss": 1.6577,
"step": 1671
},
{
"epoch": 0.4037672059888916,
"grad_norm": 0.30518126487731934,
"learning_rate": 5.336346354603125e-05,
"loss": 1.7055,
"step": 1672
},
{
"epoch": 0.4040086935522821,
"grad_norm": 0.32699069380760193,
"learning_rate": 5.3334271922426085e-05,
"loss": 1.6633,
"step": 1673
},
{
"epoch": 0.40425018111567257,
"grad_norm": 0.32846981287002563,
"learning_rate": 5.3305072307014684e-05,
"loss": 1.7801,
"step": 1674
},
{
"epoch": 0.40449166867906305,
"grad_norm": 0.3315163254737854,
"learning_rate": 5.3275864717297624e-05,
"loss": 1.8734,
"step": 1675
},
{
"epoch": 0.40473315624245354,
"grad_norm": 0.32653379440307617,
"learning_rate": 5.324664917078032e-05,
"loss": 1.8171,
"step": 1676
},
{
"epoch": 0.404974643805844,
"grad_norm": 0.310324102640152,
"learning_rate": 5.3217425684972876e-05,
"loss": 1.6035,
"step": 1677
},
{
"epoch": 0.4052161313692345,
"grad_norm": 0.30552801489830017,
"learning_rate": 5.318819427739021e-05,
"loss": 1.5884,
"step": 1678
},
{
"epoch": 0.405457618932625,
"grad_norm": 0.31171873211860657,
"learning_rate": 5.315895496555197e-05,
"loss": 1.7287,
"step": 1679
},
{
"epoch": 0.4056991064960155,
"grad_norm": 0.4216386377811432,
"learning_rate": 5.312970776698252e-05,
"loss": 1.8202,
"step": 1680
},
{
"epoch": 0.40594059405940597,
"grad_norm": 0.30686837434768677,
"learning_rate": 5.3100452699211e-05,
"loss": 1.6182,
"step": 1681
},
{
"epoch": 0.40618208162279645,
"grad_norm": 0.31551510095596313,
"learning_rate": 5.307118977977122e-05,
"loss": 1.7769,
"step": 1682
},
{
"epoch": 0.4064235691861869,
"grad_norm": 0.32668325304985046,
"learning_rate": 5.3041919026201714e-05,
"loss": 1.8593,
"step": 1683
},
{
"epoch": 0.40666505674957737,
"grad_norm": 0.3222865164279938,
"learning_rate": 5.301264045604573e-05,
"loss": 1.7289,
"step": 1684
},
{
"epoch": 0.40690654431296785,
"grad_norm": 0.319663941860199,
"learning_rate": 5.2983354086851146e-05,
"loss": 1.7866,
"step": 1685
},
{
"epoch": 0.40714803187635834,
"grad_norm": 0.3232978582382202,
"learning_rate": 5.295405993617059e-05,
"loss": 1.761,
"step": 1686
},
{
"epoch": 0.4073895194397488,
"grad_norm": 0.31206750869750977,
"learning_rate": 5.29247580215613e-05,
"loss": 1.5944,
"step": 1687
},
{
"epoch": 0.4076310070031393,
"grad_norm": 0.3296249508857727,
"learning_rate": 5.289544836058517e-05,
"loss": 1.7709,
"step": 1688
},
{
"epoch": 0.4078724945665298,
"grad_norm": 0.30123740434646606,
"learning_rate": 5.286613097080876e-05,
"loss": 1.6726,
"step": 1689
},
{
"epoch": 0.4081139821299203,
"grad_norm": 0.3386242091655731,
"learning_rate": 5.2836805869803255e-05,
"loss": 1.9382,
"step": 1690
},
{
"epoch": 0.40835546969331077,
"grad_norm": 0.3159593641757965,
"learning_rate": 5.2807473075144445e-05,
"loss": 1.6599,
"step": 1691
},
{
"epoch": 0.40859695725670125,
"grad_norm": 0.33741095662117004,
"learning_rate": 5.277813260441274e-05,
"loss": 1.9443,
"step": 1692
},
{
"epoch": 0.40883844482009174,
"grad_norm": 0.3497970998287201,
"learning_rate": 5.274878447519318e-05,
"loss": 1.9927,
"step": 1693
},
{
"epoch": 0.4090799323834822,
"grad_norm": 0.3178463578224182,
"learning_rate": 5.271942870507534e-05,
"loss": 1.5977,
"step": 1694
},
{
"epoch": 0.4093214199468727,
"grad_norm": 0.31562668085098267,
"learning_rate": 5.2690065311653416e-05,
"loss": 1.6623,
"step": 1695
},
{
"epoch": 0.4095629075102632,
"grad_norm": 0.320965975522995,
"learning_rate": 5.2660694312526154e-05,
"loss": 1.7709,
"step": 1696
},
{
"epoch": 0.4098043950736537,
"grad_norm": 0.31700804829597473,
"learning_rate": 5.263131572529688e-05,
"loss": 1.8144,
"step": 1697
},
{
"epoch": 0.41004588263704417,
"grad_norm": 0.3084293007850647,
"learning_rate": 5.260192956757343e-05,
"loss": 1.713,
"step": 1698
},
{
"epoch": 0.41028737020043465,
"grad_norm": 0.31365668773651123,
"learning_rate": 5.2572535856968225e-05,
"loss": 1.7754,
"step": 1699
},
{
"epoch": 0.41052885776382514,
"grad_norm": 0.3074451684951782,
"learning_rate": 5.254313461109816e-05,
"loss": 1.7289,
"step": 1700
},
{
"epoch": 0.4107703453272156,
"grad_norm": 0.31787192821502686,
"learning_rate": 5.251372584758471e-05,
"loss": 1.7623,
"step": 1701
},
{
"epoch": 0.4110118328906061,
"grad_norm": 0.3212306797504425,
"learning_rate": 5.2484309584053794e-05,
"loss": 1.7933,
"step": 1702
},
{
"epoch": 0.4112533204539966,
"grad_norm": 0.32138124108314514,
"learning_rate": 5.2454885838135846e-05,
"loss": 1.7146,
"step": 1703
},
{
"epoch": 0.4114948080173871,
"grad_norm": 0.3186517357826233,
"learning_rate": 5.242545462746581e-05,
"loss": 1.7416,
"step": 1704
},
{
"epoch": 0.41173629558077757,
"grad_norm": 0.29256436228752136,
"learning_rate": 5.2396015969683086e-05,
"loss": 1.5187,
"step": 1705
},
{
"epoch": 0.41197778314416805,
"grad_norm": 0.30244478583335876,
"learning_rate": 5.23665698824315e-05,
"loss": 1.7165,
"step": 1706
},
{
"epoch": 0.41221927070755854,
"grad_norm": 0.31398919224739075,
"learning_rate": 5.2337116383359415e-05,
"loss": 1.6597,
"step": 1707
},
{
"epoch": 0.412460758270949,
"grad_norm": 0.31446996331214905,
"learning_rate": 5.2307655490119546e-05,
"loss": 1.6449,
"step": 1708
},
{
"epoch": 0.4127022458343395,
"grad_norm": 0.2963344156742096,
"learning_rate": 5.227818722036911e-05,
"loss": 1.5533,
"step": 1709
},
{
"epoch": 0.41294373339773,
"grad_norm": 0.3177819550037384,
"learning_rate": 5.22487115917697e-05,
"loss": 1.7537,
"step": 1710
},
{
"epoch": 0.4131852209611205,
"grad_norm": 0.300102174282074,
"learning_rate": 5.221922862198735e-05,
"loss": 1.6307,
"step": 1711
},
{
"epoch": 0.41342670852451097,
"grad_norm": 0.30934983491897583,
"learning_rate": 5.218973832869247e-05,
"loss": 1.676,
"step": 1712
},
{
"epoch": 0.41366819608790145,
"grad_norm": 0.32400190830230713,
"learning_rate": 5.216024072955988e-05,
"loss": 1.7713,
"step": 1713
},
{
"epoch": 0.41390968365129194,
"grad_norm": 0.3195970356464386,
"learning_rate": 5.213073584226874e-05,
"loss": 1.7983,
"step": 1714
},
{
"epoch": 0.4141511712146824,
"grad_norm": 0.33358579874038696,
"learning_rate": 5.210122368450263e-05,
"loss": 1.7347,
"step": 1715
},
{
"epoch": 0.4143926587780729,
"grad_norm": 0.32983365654945374,
"learning_rate": 5.207170427394946e-05,
"loss": 1.8242,
"step": 1716
},
{
"epoch": 0.4146341463414634,
"grad_norm": 0.3056759238243103,
"learning_rate": 5.204217762830149e-05,
"loss": 1.7023,
"step": 1717
},
{
"epoch": 0.4148756339048539,
"grad_norm": 0.3775116205215454,
"learning_rate": 5.201264376525531e-05,
"loss": 1.5639,
"step": 1718
},
{
"epoch": 0.41511712146824437,
"grad_norm": 0.31459784507751465,
"learning_rate": 5.1983102702511846e-05,
"loss": 1.7042,
"step": 1719
},
{
"epoch": 0.41535860903163485,
"grad_norm": 0.331863135099411,
"learning_rate": 5.195355445777634e-05,
"loss": 1.8641,
"step": 1720
},
{
"epoch": 0.41560009659502534,
"grad_norm": 0.3138774335384369,
"learning_rate": 5.1923999048758324e-05,
"loss": 1.7186,
"step": 1721
},
{
"epoch": 0.4158415841584158,
"grad_norm": 0.30020132660865784,
"learning_rate": 5.1894436493171646e-05,
"loss": 1.6064,
"step": 1722
},
{
"epoch": 0.4160830717218063,
"grad_norm": 0.28870853781700134,
"learning_rate": 5.186486680873442e-05,
"loss": 1.3951,
"step": 1723
},
{
"epoch": 0.4163245592851968,
"grad_norm": 0.3199133276939392,
"learning_rate": 5.1835290013169025e-05,
"loss": 1.8676,
"step": 1724
},
{
"epoch": 0.4165660468485873,
"grad_norm": 0.3255309462547302,
"learning_rate": 5.180570612420214e-05,
"loss": 1.8917,
"step": 1725
},
{
"epoch": 0.41680753441197776,
"grad_norm": 0.3171748220920563,
"learning_rate": 5.1776115159564664e-05,
"loss": 1.7169,
"step": 1726
},
{
"epoch": 0.41704902197536825,
"grad_norm": 0.3134252727031708,
"learning_rate": 5.1746517136991706e-05,
"loss": 1.8052,
"step": 1727
},
{
"epoch": 0.41729050953875874,
"grad_norm": 0.34795841574668884,
"learning_rate": 5.171691207422269e-05,
"loss": 1.8521,
"step": 1728
},
{
"epoch": 0.4175319971021492,
"grad_norm": 0.32155516743659973,
"learning_rate": 5.168729998900118e-05,
"loss": 1.7458,
"step": 1729
},
{
"epoch": 0.4177734846655397,
"grad_norm": 0.3200514614582062,
"learning_rate": 5.165768089907501e-05,
"loss": 1.7987,
"step": 1730
},
{
"epoch": 0.4180149722289302,
"grad_norm": 0.3235030174255371,
"learning_rate": 5.162805482219615e-05,
"loss": 1.6807,
"step": 1731
},
{
"epoch": 0.4182564597923207,
"grad_norm": 0.3196185529232025,
"learning_rate": 5.159842177612081e-05,
"loss": 1.7257,
"step": 1732
},
{
"epoch": 0.41849794735571116,
"grad_norm": 0.3183957636356354,
"learning_rate": 5.1568781778609336e-05,
"loss": 1.8169,
"step": 1733
},
{
"epoch": 0.41873943491910165,
"grad_norm": 0.31911373138427734,
"learning_rate": 5.153913484742629e-05,
"loss": 1.6534,
"step": 1734
},
{
"epoch": 0.41898092248249214,
"grad_norm": 0.31826508045196533,
"learning_rate": 5.1509481000340345e-05,
"loss": 1.7524,
"step": 1735
},
{
"epoch": 0.4192224100458826,
"grad_norm": 0.2995862364768982,
"learning_rate": 5.147982025512434e-05,
"loss": 1.6639,
"step": 1736
},
{
"epoch": 0.4194638976092731,
"grad_norm": 0.34899625182151794,
"learning_rate": 5.1450152629555245e-05,
"loss": 1.7866,
"step": 1737
},
{
"epoch": 0.4197053851726636,
"grad_norm": 0.3255622684955597,
"learning_rate": 5.142047814141414e-05,
"loss": 1.7003,
"step": 1738
},
{
"epoch": 0.4199468727360541,
"grad_norm": 0.328663170337677,
"learning_rate": 5.139079680848623e-05,
"loss": 1.7505,
"step": 1739
},
{
"epoch": 0.42018836029944456,
"grad_norm": 0.29785701632499695,
"learning_rate": 5.136110864856084e-05,
"loss": 1.5607,
"step": 1740
},
{
"epoch": 0.42042984786283505,
"grad_norm": 0.3232966363430023,
"learning_rate": 5.133141367943136e-05,
"loss": 1.8571,
"step": 1741
},
{
"epoch": 0.42067133542622553,
"grad_norm": 0.29955655336380005,
"learning_rate": 5.130171191889526e-05,
"loss": 1.6468,
"step": 1742
},
{
"epoch": 0.420912822989616,
"grad_norm": 0.32290521264076233,
"learning_rate": 5.127200338475411e-05,
"loss": 1.9304,
"step": 1743
},
{
"epoch": 0.4211543105530065,
"grad_norm": 0.33338356018066406,
"learning_rate": 5.124228809481351e-05,
"loss": 1.9154,
"step": 1744
},
{
"epoch": 0.421395798116397,
"grad_norm": 0.32244834303855896,
"learning_rate": 5.1212566066883116e-05,
"loss": 1.7334,
"step": 1745
},
{
"epoch": 0.4216372856797875,
"grad_norm": 0.3112456202507019,
"learning_rate": 5.118283731877663e-05,
"loss": 1.7167,
"step": 1746
},
{
"epoch": 0.42187877324317796,
"grad_norm": 0.3183744251728058,
"learning_rate": 5.1153101868311776e-05,
"loss": 1.7666,
"step": 1747
},
{
"epoch": 0.42212026080656845,
"grad_norm": 0.3148494362831116,
"learning_rate": 5.1123359733310284e-05,
"loss": 1.7667,
"step": 1748
},
{
"epoch": 0.42236174836995893,
"grad_norm": 0.33314794301986694,
"learning_rate": 5.109361093159793e-05,
"loss": 1.8291,
"step": 1749
},
{
"epoch": 0.4226032359333494,
"grad_norm": 0.3257341682910919,
"learning_rate": 5.106385548100444e-05,
"loss": 1.8156,
"step": 1750
},
{
"epoch": 0.4228447234967399,
"grad_norm": 0.314256876707077,
"learning_rate": 5.103409339936354e-05,
"loss": 1.8064,
"step": 1751
},
{
"epoch": 0.4230862110601304,
"grad_norm": 0.31599828600883484,
"learning_rate": 5.100432470451294e-05,
"loss": 1.6887,
"step": 1752
},
{
"epoch": 0.4233276986235209,
"grad_norm": 0.30414825677871704,
"learning_rate": 5.0974549414294316e-05,
"loss": 1.6797,
"step": 1753
},
{
"epoch": 0.42356918618691136,
"grad_norm": 0.3340110182762146,
"learning_rate": 5.0944767546553264e-05,
"loss": 1.9084,
"step": 1754
},
{
"epoch": 0.42381067375030185,
"grad_norm": 0.3074990510940552,
"learning_rate": 5.091497911913938e-05,
"loss": 1.6124,
"step": 1755
},
{
"epoch": 0.42405216131369233,
"grad_norm": 0.31545865535736084,
"learning_rate": 5.088518414990614e-05,
"loss": 1.6553,
"step": 1756
},
{
"epoch": 0.4242936488770828,
"grad_norm": 0.3098644018173218,
"learning_rate": 5.0855382656710944e-05,
"loss": 1.6836,
"step": 1757
},
{
"epoch": 0.4245351364404733,
"grad_norm": 0.32377690076828003,
"learning_rate": 5.082557465741513e-05,
"loss": 1.8453,
"step": 1758
},
{
"epoch": 0.4247766240038638,
"grad_norm": 0.3402831554412842,
"learning_rate": 5.0795760169883926e-05,
"loss": 1.7824,
"step": 1759
},
{
"epoch": 0.4250181115672543,
"grad_norm": 0.30646243691444397,
"learning_rate": 5.076593921198644e-05,
"loss": 1.6201,
"step": 1760
},
{
"epoch": 0.42525959913064476,
"grad_norm": 0.3204982876777649,
"learning_rate": 5.0736111801595674e-05,
"loss": 1.8092,
"step": 1761
},
{
"epoch": 0.42550108669403525,
"grad_norm": 0.34092098474502563,
"learning_rate": 5.0706277956588456e-05,
"loss": 1.8603,
"step": 1762
},
{
"epoch": 0.42574257425742573,
"grad_norm": 0.3115682899951935,
"learning_rate": 5.0676437694845544e-05,
"loss": 1.7216,
"step": 1763
},
{
"epoch": 0.4259840618208162,
"grad_norm": 0.2900623083114624,
"learning_rate": 5.064659103425145e-05,
"loss": 1.5347,
"step": 1764
},
{
"epoch": 0.4262255493842067,
"grad_norm": 0.34669458866119385,
"learning_rate": 5.0616737992694595e-05,
"loss": 2.0433,
"step": 1765
},
{
"epoch": 0.4264670369475972,
"grad_norm": 0.32388561964035034,
"learning_rate": 5.0586878588067215e-05,
"loss": 1.8193,
"step": 1766
},
{
"epoch": 0.4267085245109877,
"grad_norm": 0.32599443197250366,
"learning_rate": 5.0557012838265326e-05,
"loss": 1.6705,
"step": 1767
},
{
"epoch": 0.42695001207437816,
"grad_norm": 0.31643036007881165,
"learning_rate": 5.052714076118875e-05,
"loss": 1.6169,
"step": 1768
},
{
"epoch": 0.42719149963776865,
"grad_norm": 0.301062673330307,
"learning_rate": 5.0497262374741136e-05,
"loss": 1.706,
"step": 1769
},
{
"epoch": 0.42743298720115913,
"grad_norm": 0.31782886385917664,
"learning_rate": 5.046737769682989e-05,
"loss": 1.9235,
"step": 1770
},
{
"epoch": 0.4276744747645496,
"grad_norm": 0.3196124732494354,
"learning_rate": 5.043748674536618e-05,
"loss": 1.7779,
"step": 1771
},
{
"epoch": 0.4279159623279401,
"grad_norm": 0.31023970246315,
"learning_rate": 5.0407589538264974e-05,
"loss": 1.6582,
"step": 1772
},
{
"epoch": 0.4281574498913306,
"grad_norm": 0.31953737139701843,
"learning_rate": 5.0377686093444945e-05,
"loss": 1.6437,
"step": 1773
},
{
"epoch": 0.4283989374547211,
"grad_norm": 0.3527478873729706,
"learning_rate": 5.03477764288285e-05,
"loss": 1.9902,
"step": 1774
},
{
"epoch": 0.42864042501811156,
"grad_norm": 0.3176495134830475,
"learning_rate": 5.0317860562341825e-05,
"loss": 1.7831,
"step": 1775
},
{
"epoch": 0.42888191258150204,
"grad_norm": 0.3193947374820709,
"learning_rate": 5.02879385119148e-05,
"loss": 1.752,
"step": 1776
},
{
"epoch": 0.42912340014489253,
"grad_norm": 0.322971910238266,
"learning_rate": 5.025801029548097e-05,
"loss": 1.6216,
"step": 1777
},
{
"epoch": 0.429364887708283,
"grad_norm": 0.3086382746696472,
"learning_rate": 5.022807593097765e-05,
"loss": 1.6701,
"step": 1778
},
{
"epoch": 0.4296063752716735,
"grad_norm": 0.3198978900909424,
"learning_rate": 5.0198135436345776e-05,
"loss": 1.7816,
"step": 1779
},
{
"epoch": 0.429847862835064,
"grad_norm": 0.3353576362133026,
"learning_rate": 5.0168188829529986e-05,
"loss": 1.762,
"step": 1780
},
{
"epoch": 0.4300893503984545,
"grad_norm": 0.3208022713661194,
"learning_rate": 5.0138236128478587e-05,
"loss": 1.8141,
"step": 1781
},
{
"epoch": 0.43033083796184496,
"grad_norm": 0.32314246892929077,
"learning_rate": 5.010827735114351e-05,
"loss": 1.7433,
"step": 1782
},
{
"epoch": 0.43057232552523544,
"grad_norm": 0.3072111904621124,
"learning_rate": 5.0078312515480356e-05,
"loss": 1.7538,
"step": 1783
},
{
"epoch": 0.43081381308862593,
"grad_norm": 0.316180020570755,
"learning_rate": 5.004834163944836e-05,
"loss": 1.7431,
"step": 1784
},
{
"epoch": 0.4310553006520164,
"grad_norm": 0.3349752724170685,
"learning_rate": 5.0018364741010345e-05,
"loss": 1.791,
"step": 1785
},
{
"epoch": 0.4312967882154069,
"grad_norm": 0.31984061002731323,
"learning_rate": 4.998838183813277e-05,
"loss": 1.838,
"step": 1786
},
{
"epoch": 0.4315382757787974,
"grad_norm": 0.31683188676834106,
"learning_rate": 4.995839294878569e-05,
"loss": 1.8307,
"step": 1787
},
{
"epoch": 0.43177976334218787,
"grad_norm": 0.32636767625808716,
"learning_rate": 4.992839809094276e-05,
"loss": 1.8039,
"step": 1788
},
{
"epoch": 0.43202125090557836,
"grad_norm": 0.3164781928062439,
"learning_rate": 4.9898397282581164e-05,
"loss": 1.8272,
"step": 1789
},
{
"epoch": 0.43226273846896884,
"grad_norm": 0.3037387430667877,
"learning_rate": 4.986839054168171e-05,
"loss": 1.6591,
"step": 1790
},
{
"epoch": 0.43250422603235933,
"grad_norm": 0.31159907579421997,
"learning_rate": 4.983837788622872e-05,
"loss": 1.708,
"step": 1791
},
{
"epoch": 0.4327457135957498,
"grad_norm": 0.3006117343902588,
"learning_rate": 4.980835933421008e-05,
"loss": 1.5216,
"step": 1792
},
{
"epoch": 0.4329872011591403,
"grad_norm": 0.320086270570755,
"learning_rate": 4.9778334903617225e-05,
"loss": 1.6478,
"step": 1793
},
{
"epoch": 0.4332286887225308,
"grad_norm": 0.3265068829059601,
"learning_rate": 4.9748304612445076e-05,
"loss": 1.833,
"step": 1794
},
{
"epoch": 0.43347017628592127,
"grad_norm": 0.32293495535850525,
"learning_rate": 4.971826847869209e-05,
"loss": 1.72,
"step": 1795
},
{
"epoch": 0.43371166384931176,
"grad_norm": 0.28712642192840576,
"learning_rate": 4.9688226520360225e-05,
"loss": 1.5015,
"step": 1796
},
{
"epoch": 0.43395315141270224,
"grad_norm": 0.34202128648757935,
"learning_rate": 4.965817875545493e-05,
"loss": 1.7086,
"step": 1797
},
{
"epoch": 0.43419463897609273,
"grad_norm": 0.3475635051727295,
"learning_rate": 4.962812520198512e-05,
"loss": 1.876,
"step": 1798
},
{
"epoch": 0.4344361265394832,
"grad_norm": 0.3257412314414978,
"learning_rate": 4.959806587796321e-05,
"loss": 1.7665,
"step": 1799
},
{
"epoch": 0.4346776141028737,
"grad_norm": 0.30491873621940613,
"learning_rate": 4.956800080140503e-05,
"loss": 1.7476,
"step": 1800
},
{
"epoch": 0.4349191016662642,
"grad_norm": 0.32391414046287537,
"learning_rate": 4.953792999032989e-05,
"loss": 1.8963,
"step": 1801
},
{
"epoch": 0.43516058922965467,
"grad_norm": 0.3363605737686157,
"learning_rate": 4.950785346276054e-05,
"loss": 1.7886,
"step": 1802
},
{
"epoch": 0.43540207679304516,
"grad_norm": 0.3271222412586212,
"learning_rate": 4.947777123672314e-05,
"loss": 1.8712,
"step": 1803
},
{
"epoch": 0.43564356435643564,
"grad_norm": 0.3130126893520355,
"learning_rate": 4.9447683330247254e-05,
"loss": 1.7719,
"step": 1804
},
{
"epoch": 0.4358850519198261,
"grad_norm": 0.33484476804733276,
"learning_rate": 4.941758976136588e-05,
"loss": 1.8265,
"step": 1805
},
{
"epoch": 0.4361265394832166,
"grad_norm": 0.3352862298488617,
"learning_rate": 4.93874905481154e-05,
"loss": 1.8212,
"step": 1806
},
{
"epoch": 0.4363680270466071,
"grad_norm": 0.3315581679344177,
"learning_rate": 4.935738570853557e-05,
"loss": 1.7995,
"step": 1807
},
{
"epoch": 0.4366095146099976,
"grad_norm": 0.3371587097644806,
"learning_rate": 4.93272752606695e-05,
"loss": 1.8149,
"step": 1808
},
{
"epoch": 0.43685100217338807,
"grad_norm": 0.32511308789253235,
"learning_rate": 4.9297159222563735e-05,
"loss": 1.8111,
"step": 1809
},
{
"epoch": 0.43709248973677856,
"grad_norm": 0.32551050186157227,
"learning_rate": 4.926703761226808e-05,
"loss": 1.5647,
"step": 1810
},
{
"epoch": 0.43733397730016904,
"grad_norm": 0.30943354964256287,
"learning_rate": 4.9236910447835735e-05,
"loss": 1.6284,
"step": 1811
},
{
"epoch": 0.4375754648635595,
"grad_norm": 0.3279415965080261,
"learning_rate": 4.920677774732321e-05,
"loss": 1.8771,
"step": 1812
},
{
"epoch": 0.43781695242695,
"grad_norm": 0.32760724425315857,
"learning_rate": 4.917663952879033e-05,
"loss": 1.5721,
"step": 1813
},
{
"epoch": 0.4380584399903405,
"grad_norm": 0.3225950598716736,
"learning_rate": 4.914649581030025e-05,
"loss": 1.7678,
"step": 1814
},
{
"epoch": 0.438299927553731,
"grad_norm": 0.31700098514556885,
"learning_rate": 4.91163466099194e-05,
"loss": 1.6486,
"step": 1815
},
{
"epoch": 0.43854141511712147,
"grad_norm": 0.3183005154132843,
"learning_rate": 4.9086191945717476e-05,
"loss": 1.5372,
"step": 1816
},
{
"epoch": 0.43878290268051195,
"grad_norm": 0.3154526352882385,
"learning_rate": 4.905603183576751e-05,
"loss": 1.619,
"step": 1817
},
{
"epoch": 0.43902439024390244,
"grad_norm": 0.32441118359565735,
"learning_rate": 4.902586629814574e-05,
"loss": 1.7581,
"step": 1818
},
{
"epoch": 0.4392658778072929,
"grad_norm": 0.31786707043647766,
"learning_rate": 4.899569535093167e-05,
"loss": 1.6648,
"step": 1819
},
{
"epoch": 0.4395073653706834,
"grad_norm": 0.30324339866638184,
"learning_rate": 4.8965519012208085e-05,
"loss": 1.6787,
"step": 1820
},
{
"epoch": 0.4397488529340739,
"grad_norm": 0.3461436629295349,
"learning_rate": 4.893533730006095e-05,
"loss": 1.6268,
"step": 1821
},
{
"epoch": 0.4399903404974644,
"grad_norm": 0.3194788098335266,
"learning_rate": 4.890515023257946e-05,
"loss": 1.8323,
"step": 1822
},
{
"epoch": 0.44023182806085487,
"grad_norm": 0.3068380057811737,
"learning_rate": 4.887495782785605e-05,
"loss": 1.8317,
"step": 1823
},
{
"epoch": 0.44047331562424535,
"grad_norm": 0.3199669420719147,
"learning_rate": 4.8844760103986346e-05,
"loss": 1.8499,
"step": 1824
},
{
"epoch": 0.44071480318763584,
"grad_norm": 0.3064357042312622,
"learning_rate": 4.881455707906911e-05,
"loss": 1.6994,
"step": 1825
},
{
"epoch": 0.4409562907510263,
"grad_norm": 0.32749029994010925,
"learning_rate": 4.8784348771206366e-05,
"loss": 1.742,
"step": 1826
},
{
"epoch": 0.4411977783144168,
"grad_norm": 0.29773974418640137,
"learning_rate": 4.875413519850323e-05,
"loss": 1.5753,
"step": 1827
},
{
"epoch": 0.4414392658778073,
"grad_norm": 0.314562052488327,
"learning_rate": 4.872391637906802e-05,
"loss": 1.729,
"step": 1828
},
{
"epoch": 0.4416807534411978,
"grad_norm": 0.3068162202835083,
"learning_rate": 4.869369233101217e-05,
"loss": 1.6617,
"step": 1829
},
{
"epoch": 0.44192224100458827,
"grad_norm": 0.31325581669807434,
"learning_rate": 4.866346307245027e-05,
"loss": 1.8767,
"step": 1830
},
{
"epoch": 0.44216372856797875,
"grad_norm": 0.3100229501724243,
"learning_rate": 4.8633228621500014e-05,
"loss": 1.6149,
"step": 1831
},
{
"epoch": 0.44240521613136924,
"grad_norm": 0.3256266415119171,
"learning_rate": 4.8602988996282235e-05,
"loss": 1.7831,
"step": 1832
},
{
"epoch": 0.4426467036947597,
"grad_norm": 0.337890088558197,
"learning_rate": 4.857274421492082e-05,
"loss": 1.865,
"step": 1833
},
{
"epoch": 0.4428881912581502,
"grad_norm": 0.3197672963142395,
"learning_rate": 4.854249429554281e-05,
"loss": 1.8182,
"step": 1834
},
{
"epoch": 0.4431296788215407,
"grad_norm": 0.31269827485084534,
"learning_rate": 4.851223925627826e-05,
"loss": 1.6953,
"step": 1835
},
{
"epoch": 0.4433711663849312,
"grad_norm": 0.30737265944480896,
"learning_rate": 4.848197911526034e-05,
"loss": 1.6799,
"step": 1836
},
{
"epoch": 0.44361265394832167,
"grad_norm": 0.3163803815841675,
"learning_rate": 4.8451713890625265e-05,
"loss": 1.6822,
"step": 1837
},
{
"epoch": 0.44385414151171215,
"grad_norm": 0.3576357662677765,
"learning_rate": 4.842144360051228e-05,
"loss": 1.7801,
"step": 1838
},
{
"epoch": 0.44409562907510264,
"grad_norm": 0.3141801059246063,
"learning_rate": 4.839116826306369e-05,
"loss": 1.842,
"step": 1839
},
{
"epoch": 0.4443371166384931,
"grad_norm": 0.3117457330226898,
"learning_rate": 4.836088789642482e-05,
"loss": 1.6693,
"step": 1840
},
{
"epoch": 0.4445786042018836,
"grad_norm": 0.3110361695289612,
"learning_rate": 4.833060251874399e-05,
"loss": 1.7368,
"step": 1841
},
{
"epoch": 0.4448200917652741,
"grad_norm": 0.3443051278591156,
"learning_rate": 4.830031214817253e-05,
"loss": 1.857,
"step": 1842
},
{
"epoch": 0.4450615793286646,
"grad_norm": 0.39485305547714233,
"learning_rate": 4.827001680286481e-05,
"loss": 1.8448,
"step": 1843
},
{
"epoch": 0.44530306689205507,
"grad_norm": 0.3087663948535919,
"learning_rate": 4.8239716500978106e-05,
"loss": 1.7263,
"step": 1844
},
{
"epoch": 0.44554455445544555,
"grad_norm": 0.32222047448158264,
"learning_rate": 4.8209411260672705e-05,
"loss": 1.8257,
"step": 1845
},
{
"epoch": 0.44578604201883604,
"grad_norm": 0.322906494140625,
"learning_rate": 4.8179101100111864e-05,
"loss": 1.6751,
"step": 1846
},
{
"epoch": 0.4460275295822265,
"grad_norm": 0.3205435872077942,
"learning_rate": 4.8148786037461764e-05,
"loss": 1.8693,
"step": 1847
},
{
"epoch": 0.446269017145617,
"grad_norm": 0.31261250376701355,
"learning_rate": 4.811846609089153e-05,
"loss": 1.6956,
"step": 1848
},
{
"epoch": 0.4465105047090075,
"grad_norm": 0.3247355818748474,
"learning_rate": 4.808814127857322e-05,
"loss": 1.7054,
"step": 1849
},
{
"epoch": 0.446751992272398,
"grad_norm": 0.3224380612373352,
"learning_rate": 4.805781161868182e-05,
"loss": 1.6681,
"step": 1850
},
{
"epoch": 0.44699347983578847,
"grad_norm": 0.3073568344116211,
"learning_rate": 4.802747712939518e-05,
"loss": 1.6864,
"step": 1851
},
{
"epoch": 0.44723496739917895,
"grad_norm": 0.34604325890541077,
"learning_rate": 4.799713782889409e-05,
"loss": 1.969,
"step": 1852
},
{
"epoch": 0.44747645496256944,
"grad_norm": 0.3278951346874237,
"learning_rate": 4.796679373536222e-05,
"loss": 1.6306,
"step": 1853
},
{
"epoch": 0.4477179425259599,
"grad_norm": 0.3199866712093353,
"learning_rate": 4.7936444866986066e-05,
"loss": 1.6913,
"step": 1854
},
{
"epoch": 0.4479594300893504,
"grad_norm": 0.32705411314964294,
"learning_rate": 4.790609124195506e-05,
"loss": 1.8419,
"step": 1855
},
{
"epoch": 0.4482009176527409,
"grad_norm": 0.3279324471950531,
"learning_rate": 4.78757328784614e-05,
"loss": 1.711,
"step": 1856
},
{
"epoch": 0.4484424052161314,
"grad_norm": 0.3183402419090271,
"learning_rate": 4.7845369794700185e-05,
"loss": 1.7563,
"step": 1857
},
{
"epoch": 0.44868389277952186,
"grad_norm": 0.3299994170665741,
"learning_rate": 4.781500200886934e-05,
"loss": 1.7747,
"step": 1858
},
{
"epoch": 0.44892538034291235,
"grad_norm": 0.33904218673706055,
"learning_rate": 4.7784629539169555e-05,
"loss": 1.7146,
"step": 1859
},
{
"epoch": 0.44916686790630284,
"grad_norm": 0.31083980202674866,
"learning_rate": 4.7754252403804404e-05,
"loss": 1.6899,
"step": 1860
},
{
"epoch": 0.4494083554696933,
"grad_norm": 0.320126473903656,
"learning_rate": 4.7723870620980206e-05,
"loss": 1.65,
"step": 1861
},
{
"epoch": 0.4496498430330838,
"grad_norm": 0.322860985994339,
"learning_rate": 4.769348420890607e-05,
"loss": 1.7541,
"step": 1862
},
{
"epoch": 0.4498913305964743,
"grad_norm": 0.3172602653503418,
"learning_rate": 4.766309318579391e-05,
"loss": 1.6166,
"step": 1863
},
{
"epoch": 0.4501328181598648,
"grad_norm": 0.33034148812294006,
"learning_rate": 4.7632697569858336e-05,
"loss": 1.9764,
"step": 1864
},
{
"epoch": 0.45037430572325526,
"grad_norm": 0.3384269177913666,
"learning_rate": 4.760229737931681e-05,
"loss": 1.7827,
"step": 1865
},
{
"epoch": 0.45061579328664575,
"grad_norm": 0.3293705880641937,
"learning_rate": 4.7571892632389454e-05,
"loss": 1.7764,
"step": 1866
},
{
"epoch": 0.45085728085003623,
"grad_norm": 0.32411205768585205,
"learning_rate": 4.7541483347299154e-05,
"loss": 1.7321,
"step": 1867
},
{
"epoch": 0.4510987684134267,
"grad_norm": 0.3312840163707733,
"learning_rate": 4.7511069542271504e-05,
"loss": 1.8471,
"step": 1868
},
{
"epoch": 0.4513402559768172,
"grad_norm": 0.33269646763801575,
"learning_rate": 4.748065123553481e-05,
"loss": 1.7057,
"step": 1869
},
{
"epoch": 0.4515817435402077,
"grad_norm": 0.32271480560302734,
"learning_rate": 4.74502284453201e-05,
"loss": 1.7683,
"step": 1870
},
{
"epoch": 0.4518232311035982,
"grad_norm": 0.32621634006500244,
"learning_rate": 4.7419801189861065e-05,
"loss": 1.9058,
"step": 1871
},
{
"epoch": 0.45206471866698866,
"grad_norm": 0.31796547770500183,
"learning_rate": 4.7389369487394046e-05,
"loss": 1.5809,
"step": 1872
},
{
"epoch": 0.45230620623037915,
"grad_norm": 0.31985053420066833,
"learning_rate": 4.735893335615812e-05,
"loss": 1.7732,
"step": 1873
},
{
"epoch": 0.45254769379376963,
"grad_norm": 0.3129877746105194,
"learning_rate": 4.732849281439495e-05,
"loss": 1.7053,
"step": 1874
},
{
"epoch": 0.4527891813571601,
"grad_norm": 0.3248676359653473,
"learning_rate": 4.729804788034887e-05,
"loss": 1.9495,
"step": 1875
},
{
"epoch": 0.4530306689205506,
"grad_norm": 0.32636207342147827,
"learning_rate": 4.726759857226688e-05,
"loss": 1.906,
"step": 1876
},
{
"epoch": 0.4532721564839411,
"grad_norm": 0.31957873702049255,
"learning_rate": 4.723714490839853e-05,
"loss": 1.7117,
"step": 1877
},
{
"epoch": 0.4535136440473316,
"grad_norm": 0.33045974373817444,
"learning_rate": 4.720668690699603e-05,
"loss": 1.7147,
"step": 1878
},
{
"epoch": 0.45375513161072206,
"grad_norm": 0.3191014230251312,
"learning_rate": 4.717622458631418e-05,
"loss": 1.6461,
"step": 1879
},
{
"epoch": 0.45399661917411255,
"grad_norm": 0.33815374970436096,
"learning_rate": 4.714575796461038e-05,
"loss": 1.7384,
"step": 1880
},
{
"epoch": 0.45423810673750303,
"grad_norm": 0.3154662847518921,
"learning_rate": 4.711528706014457e-05,
"loss": 1.8105,
"step": 1881
},
{
"epoch": 0.4544795943008935,
"grad_norm": 0.3145321011543274,
"learning_rate": 4.70848118911793e-05,
"loss": 1.6214,
"step": 1882
},
{
"epoch": 0.454721081864284,
"grad_norm": 0.3181321322917938,
"learning_rate": 4.705433247597965e-05,
"loss": 1.6951,
"step": 1883
},
{
"epoch": 0.4549625694276745,
"grad_norm": 0.338344007730484,
"learning_rate": 4.702384883281325e-05,
"loss": 1.8272,
"step": 1884
},
{
"epoch": 0.455204056991065,
"grad_norm": 0.33954915404319763,
"learning_rate": 4.699336097995027e-05,
"loss": 1.9373,
"step": 1885
},
{
"epoch": 0.45544554455445546,
"grad_norm": 0.3360753655433655,
"learning_rate": 4.696286893566341e-05,
"loss": 1.5562,
"step": 1886
},
{
"epoch": 0.45568703211784595,
"grad_norm": 0.3098269999027252,
"learning_rate": 4.693237271822786e-05,
"loss": 1.6535,
"step": 1887
},
{
"epoch": 0.45592851968123643,
"grad_norm": 0.3185242712497711,
"learning_rate": 4.6901872345921326e-05,
"loss": 1.7053,
"step": 1888
},
{
"epoch": 0.4561700072446269,
"grad_norm": 0.3227466642856598,
"learning_rate": 4.6871367837024e-05,
"loss": 1.8213,
"step": 1889
},
{
"epoch": 0.4564114948080174,
"grad_norm": 0.32636722922325134,
"learning_rate": 4.6840859209818554e-05,
"loss": 1.8187,
"step": 1890
},
{
"epoch": 0.4566529823714079,
"grad_norm": 0.318192720413208,
"learning_rate": 4.681034648259014e-05,
"loss": 1.7479,
"step": 1891
},
{
"epoch": 0.4568944699347984,
"grad_norm": 0.30496731400489807,
"learning_rate": 4.677982967362633e-05,
"loss": 1.7133,
"step": 1892
},
{
"epoch": 0.45713595749818886,
"grad_norm": 0.33690890669822693,
"learning_rate": 4.674930880121719e-05,
"loss": 1.7466,
"step": 1893
},
{
"epoch": 0.45737744506157935,
"grad_norm": 0.31268423795700073,
"learning_rate": 4.67187838836552e-05,
"loss": 1.8265,
"step": 1894
},
{
"epoch": 0.45761893262496983,
"grad_norm": 0.33327123522758484,
"learning_rate": 4.668825493923525e-05,
"loss": 1.9799,
"step": 1895
},
{
"epoch": 0.4578604201883603,
"grad_norm": 0.30676886439323425,
"learning_rate": 4.6657721986254674e-05,
"loss": 1.6389,
"step": 1896
},
{
"epoch": 0.4581019077517508,
"grad_norm": 0.3276241719722748,
"learning_rate": 4.6627185043013165e-05,
"loss": 1.7445,
"step": 1897
},
{
"epoch": 0.4583433953151413,
"grad_norm": 0.30695146322250366,
"learning_rate": 4.659664412781286e-05,
"loss": 1.6091,
"step": 1898
},
{
"epoch": 0.4585848828785318,
"grad_norm": 0.31264829635620117,
"learning_rate": 4.656609925895826e-05,
"loss": 1.7049,
"step": 1899
},
{
"epoch": 0.45882637044192226,
"grad_norm": 0.32918858528137207,
"learning_rate": 4.65355504547562e-05,
"loss": 1.795,
"step": 1900
},
{
"epoch": 0.45906785800531275,
"grad_norm": 0.31754815578460693,
"learning_rate": 4.6504997733515904e-05,
"loss": 1.7422,
"step": 1901
},
{
"epoch": 0.45930934556870323,
"grad_norm": 0.33143150806427,
"learning_rate": 4.6474441113548957e-05,
"loss": 1.8414,
"step": 1902
},
{
"epoch": 0.4595508331320937,
"grad_norm": 0.31607118248939514,
"learning_rate": 4.6443880613169254e-05,
"loss": 1.6834,
"step": 1903
},
{
"epoch": 0.4597923206954842,
"grad_norm": 0.3158678114414215,
"learning_rate": 4.641331625069302e-05,
"loss": 1.6669,
"step": 1904
},
{
"epoch": 0.4600338082588747,
"grad_norm": 0.3216167688369751,
"learning_rate": 4.6382748044438815e-05,
"loss": 1.7106,
"step": 1905
},
{
"epoch": 0.4602752958222652,
"grad_norm": 0.38955986499786377,
"learning_rate": 4.6352176012727484e-05,
"loss": 1.788,
"step": 1906
},
{
"epoch": 0.46051678338565566,
"grad_norm": 0.3093554675579071,
"learning_rate": 4.632160017388215e-05,
"loss": 1.663,
"step": 1907
},
{
"epoch": 0.46075827094904614,
"grad_norm": 0.32816994190216064,
"learning_rate": 4.629102054622825e-05,
"loss": 1.7601,
"step": 1908
},
{
"epoch": 0.46099975851243663,
"grad_norm": 0.3421451449394226,
"learning_rate": 4.626043714809348e-05,
"loss": 1.7477,
"step": 1909
},
{
"epoch": 0.4612412460758271,
"grad_norm": 0.3192618191242218,
"learning_rate": 4.622984999780779e-05,
"loss": 1.6711,
"step": 1910
},
{
"epoch": 0.4614827336392176,
"grad_norm": 0.3109111785888672,
"learning_rate": 4.61992591137034e-05,
"loss": 1.6517,
"step": 1911
},
{
"epoch": 0.4617242212026081,
"grad_norm": 0.3304436504840851,
"learning_rate": 4.6168664514114723e-05,
"loss": 1.7932,
"step": 1912
},
{
"epoch": 0.4619657087659986,
"grad_norm": 0.3186758756637573,
"learning_rate": 4.613806621737844e-05,
"loss": 1.7554,
"step": 1913
},
{
"epoch": 0.46220719632938906,
"grad_norm": 0.31981173157691956,
"learning_rate": 4.6107464241833436e-05,
"loss": 1.7032,
"step": 1914
},
{
"epoch": 0.46244868389277954,
"grad_norm": 0.3194178342819214,
"learning_rate": 4.6076858605820804e-05,
"loss": 1.6827,
"step": 1915
},
{
"epoch": 0.46269017145617003,
"grad_norm": 0.34643322229385376,
"learning_rate": 4.604624932768382e-05,
"loss": 2.0343,
"step": 1916
},
{
"epoch": 0.4629316590195605,
"grad_norm": 0.32240161299705505,
"learning_rate": 4.6015636425767933e-05,
"loss": 1.7716,
"step": 1917
},
{
"epoch": 0.463173146582951,
"grad_norm": 0.3118249773979187,
"learning_rate": 4.59850199184208e-05,
"loss": 1.7507,
"step": 1918
},
{
"epoch": 0.4634146341463415,
"grad_norm": 0.32204747200012207,
"learning_rate": 4.595439982399222e-05,
"loss": 1.6723,
"step": 1919
},
{
"epoch": 0.46365612170973197,
"grad_norm": 0.3252248764038086,
"learning_rate": 4.592377616083413e-05,
"loss": 1.8006,
"step": 1920
},
{
"epoch": 0.46389760927312246,
"grad_norm": 0.3427707552909851,
"learning_rate": 4.5893148947300636e-05,
"loss": 1.8713,
"step": 1921
},
{
"epoch": 0.46413909683651294,
"grad_norm": 0.3312002122402191,
"learning_rate": 4.5862518201747926e-05,
"loss": 1.7791,
"step": 1922
},
{
"epoch": 0.46438058439990343,
"grad_norm": 0.3222915828227997,
"learning_rate": 4.5831883942534344e-05,
"loss": 1.7691,
"step": 1923
},
{
"epoch": 0.4646220719632939,
"grad_norm": 0.3127139210700989,
"learning_rate": 4.580124618802034e-05,
"loss": 1.7361,
"step": 1924
},
{
"epoch": 0.4648635595266844,
"grad_norm": 0.30985063314437866,
"learning_rate": 4.577060495656842e-05,
"loss": 1.706,
"step": 1925
},
{
"epoch": 0.4651050470900749,
"grad_norm": 0.3158462643623352,
"learning_rate": 4.573996026654321e-05,
"loss": 1.7321,
"step": 1926
},
{
"epoch": 0.46534653465346537,
"grad_norm": 0.3284815847873688,
"learning_rate": 4.570931213631141e-05,
"loss": 1.6042,
"step": 1927
},
{
"epoch": 0.46558802221685586,
"grad_norm": 0.3247036039829254,
"learning_rate": 4.567866058424176e-05,
"loss": 1.6458,
"step": 1928
},
{
"epoch": 0.46582950978024634,
"grad_norm": 0.31772297620773315,
"learning_rate": 4.564800562870506e-05,
"loss": 1.7685,
"step": 1929
},
{
"epoch": 0.46607099734363683,
"grad_norm": 0.3419104218482971,
"learning_rate": 4.561734728807417e-05,
"loss": 1.9509,
"step": 1930
},
{
"epoch": 0.4663124849070273,
"grad_norm": 0.3184857964515686,
"learning_rate": 4.558668558072393e-05,
"loss": 1.6747,
"step": 1931
},
{
"epoch": 0.4665539724704178,
"grad_norm": 0.3354939818382263,
"learning_rate": 4.555602052503126e-05,
"loss": 1.8638,
"step": 1932
},
{
"epoch": 0.4667954600338083,
"grad_norm": 0.3130846619606018,
"learning_rate": 4.5525352139375035e-05,
"loss": 1.716,
"step": 1933
},
{
"epoch": 0.46703694759719877,
"grad_norm": 0.3140762150287628,
"learning_rate": 4.5494680442136144e-05,
"loss": 1.7392,
"step": 1934
},
{
"epoch": 0.46727843516058926,
"grad_norm": 0.32126384973526,
"learning_rate": 4.546400545169748e-05,
"loss": 1.879,
"step": 1935
},
{
"epoch": 0.46751992272397974,
"grad_norm": 0.31407633423805237,
"learning_rate": 4.543332718644388e-05,
"loss": 1.631,
"step": 1936
},
{
"epoch": 0.4677614102873702,
"grad_norm": 0.3271917402744293,
"learning_rate": 4.5402645664762144e-05,
"loss": 1.7332,
"step": 1937
},
{
"epoch": 0.4680028978507607,
"grad_norm": 0.3262588381767273,
"learning_rate": 4.5371960905041066e-05,
"loss": 1.7904,
"step": 1938
},
{
"epoch": 0.4682443854141512,
"grad_norm": 0.3321874439716339,
"learning_rate": 4.534127292567133e-05,
"loss": 1.8836,
"step": 1939
},
{
"epoch": 0.4684858729775417,
"grad_norm": 0.32539454102516174,
"learning_rate": 4.531058174504557e-05,
"loss": 1.8183,
"step": 1940
},
{
"epoch": 0.46872736054093217,
"grad_norm": 0.31996139883995056,
"learning_rate": 4.5279887381558335e-05,
"loss": 1.8423,
"step": 1941
},
{
"epoch": 0.4689688481043226,
"grad_norm": 0.2960781753063202,
"learning_rate": 4.524918985360611e-05,
"loss": 1.5413,
"step": 1942
},
{
"epoch": 0.4692103356677131,
"grad_norm": 0.33326393365859985,
"learning_rate": 4.521848917958721e-05,
"loss": 1.7277,
"step": 1943
},
{
"epoch": 0.46945182323110357,
"grad_norm": 0.30825114250183105,
"learning_rate": 4.518778537790193e-05,
"loss": 1.5946,
"step": 1944
},
{
"epoch": 0.46969331079449406,
"grad_norm": 0.3104898929595947,
"learning_rate": 4.515707846695235e-05,
"loss": 1.5605,
"step": 1945
},
{
"epoch": 0.46993479835788454,
"grad_norm": 0.3065233826637268,
"learning_rate": 4.512636846514245e-05,
"loss": 1.6081,
"step": 1946
},
{
"epoch": 0.47017628592127503,
"grad_norm": 0.32400989532470703,
"learning_rate": 4.509565539087809e-05,
"loss": 1.7397,
"step": 1947
},
{
"epoch": 0.4704177734846655,
"grad_norm": 0.31074362993240356,
"learning_rate": 4.506493926256692e-05,
"loss": 1.7263,
"step": 1948
},
{
"epoch": 0.470659261048056,
"grad_norm": 0.3119424283504486,
"learning_rate": 4.5034220098618445e-05,
"loss": 1.6285,
"step": 1949
},
{
"epoch": 0.4709007486114465,
"grad_norm": 0.3202967345714569,
"learning_rate": 4.500349791744401e-05,
"loss": 1.6423,
"step": 1950
},
{
"epoch": 0.47114223617483697,
"grad_norm": 0.3224698603153229,
"learning_rate": 4.4972772737456734e-05,
"loss": 1.8148,
"step": 1951
},
{
"epoch": 0.47138372373822746,
"grad_norm": 0.3153221607208252,
"learning_rate": 4.494204457707153e-05,
"loss": 1.6917,
"step": 1952
},
{
"epoch": 0.47162521130161794,
"grad_norm": 0.32202938199043274,
"learning_rate": 4.4911313454705155e-05,
"loss": 1.8316,
"step": 1953
},
{
"epoch": 0.4718666988650084,
"grad_norm": 0.330608606338501,
"learning_rate": 4.488057938877607e-05,
"loss": 1.7924,
"step": 1954
},
{
"epoch": 0.4721081864283989,
"grad_norm": 0.32101622223854065,
"learning_rate": 4.484984239770454e-05,
"loss": 1.7442,
"step": 1955
},
{
"epoch": 0.4723496739917894,
"grad_norm": 0.3142457604408264,
"learning_rate": 4.4819102499912575e-05,
"loss": 1.6354,
"step": 1956
},
{
"epoch": 0.4725911615551799,
"grad_norm": 0.3051566183567047,
"learning_rate": 4.478835971382392e-05,
"loss": 1.6723,
"step": 1957
},
{
"epoch": 0.47283264911857037,
"grad_norm": 0.31328076124191284,
"learning_rate": 4.475761405786407e-05,
"loss": 1.6896,
"step": 1958
},
{
"epoch": 0.47307413668196086,
"grad_norm": 0.3216973841190338,
"learning_rate": 4.4726865550460215e-05,
"loss": 1.7345,
"step": 1959
},
{
"epoch": 0.47331562424535134,
"grad_norm": 0.3146194517612457,
"learning_rate": 4.469611421004126e-05,
"loss": 1.6428,
"step": 1960
},
{
"epoch": 0.4735571118087418,
"grad_norm": 0.33474940061569214,
"learning_rate": 4.4665360055037834e-05,
"loss": 1.7699,
"step": 1961
},
{
"epoch": 0.4737985993721323,
"grad_norm": 0.30783769488334656,
"learning_rate": 4.463460310388222e-05,
"loss": 1.6049,
"step": 1962
},
{
"epoch": 0.4740400869355228,
"grad_norm": 0.3315912187099457,
"learning_rate": 4.4603843375008387e-05,
"loss": 1.7062,
"step": 1963
},
{
"epoch": 0.4742815744989133,
"grad_norm": 0.33379220962524414,
"learning_rate": 4.457308088685197e-05,
"loss": 1.8349,
"step": 1964
},
{
"epoch": 0.47452306206230377,
"grad_norm": 0.29385891556739807,
"learning_rate": 4.454231565785029e-05,
"loss": 1.5972,
"step": 1965
},
{
"epoch": 0.47476454962569425,
"grad_norm": 0.33387261629104614,
"learning_rate": 4.451154770644224e-05,
"loss": 1.8021,
"step": 1966
},
{
"epoch": 0.47500603718908474,
"grad_norm": 0.346824049949646,
"learning_rate": 4.4480777051068416e-05,
"loss": 1.7912,
"step": 1967
},
{
"epoch": 0.4752475247524752,
"grad_norm": 0.3210572302341461,
"learning_rate": 4.445000371017099e-05,
"loss": 1.7741,
"step": 1968
},
{
"epoch": 0.4754890123158657,
"grad_norm": 0.3143101632595062,
"learning_rate": 4.441922770219374e-05,
"loss": 1.5724,
"step": 1969
},
{
"epoch": 0.4757304998792562,
"grad_norm": 0.3186543881893158,
"learning_rate": 4.4388449045582086e-05,
"loss": 1.6874,
"step": 1970
},
{
"epoch": 0.4759719874426467,
"grad_norm": 0.3205025792121887,
"learning_rate": 4.4357667758783e-05,
"loss": 1.5621,
"step": 1971
},
{
"epoch": 0.47621347500603717,
"grad_norm": 0.3176744282245636,
"learning_rate": 4.432688386024503e-05,
"loss": 1.8236,
"step": 1972
},
{
"epoch": 0.47645496256942765,
"grad_norm": 0.33443495631217957,
"learning_rate": 4.429609736841832e-05,
"loss": 1.9467,
"step": 1973
},
{
"epoch": 0.47669645013281814,
"grad_norm": 0.3172236680984497,
"learning_rate": 4.426530830175452e-05,
"loss": 1.776,
"step": 1974
},
{
"epoch": 0.4769379376962086,
"grad_norm": 0.3080536425113678,
"learning_rate": 4.423451667870686e-05,
"loss": 1.6937,
"step": 1975
},
{
"epoch": 0.4771794252595991,
"grad_norm": 0.31537625193595886,
"learning_rate": 4.4203722517730104e-05,
"loss": 1.6426,
"step": 1976
},
{
"epoch": 0.4774209128229896,
"grad_norm": 0.30593976378440857,
"learning_rate": 4.417292583728053e-05,
"loss": 1.663,
"step": 1977
},
{
"epoch": 0.4776624003863801,
"grad_norm": 0.3199318051338196,
"learning_rate": 4.4142126655815886e-05,
"loss": 1.7582,
"step": 1978
},
{
"epoch": 0.47790388794977057,
"grad_norm": 0.3328000009059906,
"learning_rate": 4.411132499179549e-05,
"loss": 1.7726,
"step": 1979
},
{
"epoch": 0.47814537551316105,
"grad_norm": 0.31644874811172485,
"learning_rate": 4.4080520863680106e-05,
"loss": 1.7679,
"step": 1980
},
{
"epoch": 0.47838686307655154,
"grad_norm": 0.3406371474266052,
"learning_rate": 4.4049714289931956e-05,
"loss": 1.9363,
"step": 1981
},
{
"epoch": 0.478628350639942,
"grad_norm": 0.3192148804664612,
"learning_rate": 4.401890528901479e-05,
"loss": 1.7492,
"step": 1982
},
{
"epoch": 0.4788698382033325,
"grad_norm": 0.3432200849056244,
"learning_rate": 4.3988093879393754e-05,
"loss": 1.7355,
"step": 1983
},
{
"epoch": 0.479111325766723,
"grad_norm": 0.30041298270225525,
"learning_rate": 4.395728007953545e-05,
"loss": 1.7963,
"step": 1984
},
{
"epoch": 0.4793528133301135,
"grad_norm": 0.2945508360862732,
"learning_rate": 4.392646390790794e-05,
"loss": 1.5881,
"step": 1985
},
{
"epoch": 0.47959430089350397,
"grad_norm": 0.3067844808101654,
"learning_rate": 4.389564538298068e-05,
"loss": 1.677,
"step": 1986
},
{
"epoch": 0.47983578845689445,
"grad_norm": 0.29964399337768555,
"learning_rate": 4.386482452322456e-05,
"loss": 1.4658,
"step": 1987
},
{
"epoch": 0.48007727602028494,
"grad_norm": 0.3236359655857086,
"learning_rate": 4.383400134711183e-05,
"loss": 1.652,
"step": 1988
},
{
"epoch": 0.4803187635836754,
"grad_norm": 0.30299097299575806,
"learning_rate": 4.380317587311618e-05,
"loss": 1.6701,
"step": 1989
},
{
"epoch": 0.4805602511470659,
"grad_norm": 0.3327222466468811,
"learning_rate": 4.377234811971263e-05,
"loss": 1.6186,
"step": 1990
},
{
"epoch": 0.4808017387104564,
"grad_norm": 0.3213178217411041,
"learning_rate": 4.374151810537759e-05,
"loss": 1.6802,
"step": 1991
},
{
"epoch": 0.4810432262738469,
"grad_norm": 0.3151525855064392,
"learning_rate": 4.3710685848588846e-05,
"loss": 1.7172,
"step": 1992
},
{
"epoch": 0.48128471383723737,
"grad_norm": 0.31488415598869324,
"learning_rate": 4.367985136782547e-05,
"loss": 1.6706,
"step": 1993
},
{
"epoch": 0.48152620140062785,
"grad_norm": 0.34251371026039124,
"learning_rate": 4.3649014681567914e-05,
"loss": 1.9582,
"step": 1994
},
{
"epoch": 0.48176768896401834,
"grad_norm": 0.3280927240848541,
"learning_rate": 4.361817580829795e-05,
"loss": 1.7852,
"step": 1995
},
{
"epoch": 0.4820091765274088,
"grad_norm": 0.32400888204574585,
"learning_rate": 4.358733476649863e-05,
"loss": 1.6627,
"step": 1996
},
{
"epoch": 0.4822506640907993,
"grad_norm": 0.3338795304298401,
"learning_rate": 4.3556491574654335e-05,
"loss": 1.7898,
"step": 1997
},
{
"epoch": 0.4824921516541898,
"grad_norm": 0.3094484508037567,
"learning_rate": 4.352564625125073e-05,
"loss": 1.804,
"step": 1998
},
{
"epoch": 0.4827336392175803,
"grad_norm": 0.312665730714798,
"learning_rate": 4.349479881477473e-05,
"loss": 1.6702,
"step": 1999
},
{
"epoch": 0.48297512678097076,
"grad_norm": 0.3298127055168152,
"learning_rate": 4.3463949283714577e-05,
"loss": 1.7842,
"step": 2000
}
],
"logging_steps": 1.0,
"max_steps": 4141,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.017027157491712e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}