{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6230529595015576,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00020768431983385254,
"grad_norm": 17.47925567626953,
"learning_rate": 8.000000000000001e-07,
"loss": 2.1801,
"step": 1
},
{
"epoch": 0.0004153686396677051,
"grad_norm": 21.894914627075195,
"learning_rate": 1.6000000000000001e-06,
"loss": 2.249,
"step": 2
},
{
"epoch": 0.0006230529595015577,
"grad_norm": 25.090269088745117,
"learning_rate": 2.4000000000000003e-06,
"loss": 2.3123,
"step": 3
},
{
"epoch": 0.0008307372793354102,
"grad_norm": 18.762964248657227,
"learning_rate": 3.2000000000000003e-06,
"loss": 2.1709,
"step": 4
},
{
"epoch": 0.0010384215991692627,
"grad_norm": 18.856515884399414,
"learning_rate": 4.000000000000001e-06,
"loss": 2.1823,
"step": 5
},
{
"epoch": 0.0012461059190031153,
"grad_norm": 12.56984806060791,
"learning_rate": 4.800000000000001e-06,
"loss": 2.0798,
"step": 6
},
{
"epoch": 0.0014537902388369677,
"grad_norm": 8.624156951904297,
"learning_rate": 5.600000000000001e-06,
"loss": 1.9833,
"step": 7
},
{
"epoch": 0.0016614745586708203,
"grad_norm": 7.026400089263916,
"learning_rate": 6.4000000000000006e-06,
"loss": 2.0748,
"step": 8
},
{
"epoch": 0.001869158878504673,
"grad_norm": 4.460483074188232,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.9404,
"step": 9
},
{
"epoch": 0.0020768431983385254,
"grad_norm": 3.6536319255828857,
"learning_rate": 8.000000000000001e-06,
"loss": 2.0135,
"step": 10
},
{
"epoch": 0.002284527518172378,
"grad_norm": 2.5768752098083496,
"learning_rate": 8.8e-06,
"loss": 1.8598,
"step": 11
},
{
"epoch": 0.0024922118380062306,
"grad_norm": 2.892141103744507,
"learning_rate": 9.600000000000001e-06,
"loss": 1.8686,
"step": 12
},
{
"epoch": 0.0026998961578400833,
"grad_norm": 2.858818531036377,
"learning_rate": 1.04e-05,
"loss": 1.8634,
"step": 13
},
{
"epoch": 0.0029075804776739354,
"grad_norm": 2.683091163635254,
"learning_rate": 1.1200000000000001e-05,
"loss": 2.0208,
"step": 14
},
{
"epoch": 0.003115264797507788,
"grad_norm": 2.7056639194488525,
"learning_rate": 1.2e-05,
"loss": 1.9497,
"step": 15
},
{
"epoch": 0.0033229491173416407,
"grad_norm": 1.5998775959014893,
"learning_rate": 1.2800000000000001e-05,
"loss": 1.8816,
"step": 16
},
{
"epoch": 0.0035306334371754933,
"grad_norm": 1.1934099197387695,
"learning_rate": 1.3600000000000002e-05,
"loss": 1.8246,
"step": 17
},
{
"epoch": 0.003738317757009346,
"grad_norm": 1.1701968908309937,
"learning_rate": 1.4400000000000001e-05,
"loss": 1.8578,
"step": 18
},
{
"epoch": 0.003946002076843198,
"grad_norm": 1.1254523992538452,
"learning_rate": 1.5200000000000002e-05,
"loss": 1.8861,
"step": 19
},
{
"epoch": 0.004153686396677051,
"grad_norm": 0.8316693305969238,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.7194,
"step": 20
},
{
"epoch": 0.004361370716510903,
"grad_norm": 0.8501921892166138,
"learning_rate": 1.6800000000000002e-05,
"loss": 1.8474,
"step": 21
},
{
"epoch": 0.004569055036344756,
"grad_norm": 0.9284434914588928,
"learning_rate": 1.76e-05,
"loss": 1.9258,
"step": 22
},
{
"epoch": 0.004776739356178609,
"grad_norm": 0.9898029565811157,
"learning_rate": 1.8400000000000003e-05,
"loss": 1.8441,
"step": 23
},
{
"epoch": 0.004984423676012461,
"grad_norm": 0.8852891325950623,
"learning_rate": 1.9200000000000003e-05,
"loss": 1.8521,
"step": 24
},
{
"epoch": 0.005192107995846314,
"grad_norm": 0.9528728723526001,
"learning_rate": 2e-05,
"loss": 1.8485,
"step": 25
},
{
"epoch": 0.0053997923156801665,
"grad_norm": 0.928016185760498,
"learning_rate": 2.08e-05,
"loss": 1.8883,
"step": 26
},
{
"epoch": 0.005607476635514018,
"grad_norm": 0.9373347163200378,
"learning_rate": 2.1600000000000003e-05,
"loss": 1.891,
"step": 27
},
{
"epoch": 0.005815160955347871,
"grad_norm": 0.9534879326820374,
"learning_rate": 2.2400000000000002e-05,
"loss": 1.8144,
"step": 28
},
{
"epoch": 0.0060228452751817235,
"grad_norm": 0.7779879570007324,
"learning_rate": 2.32e-05,
"loss": 1.8153,
"step": 29
},
{
"epoch": 0.006230529595015576,
"grad_norm": 0.8465295433998108,
"learning_rate": 2.4e-05,
"loss": 1.794,
"step": 30
},
{
"epoch": 0.006438213914849429,
"grad_norm": 0.797340452671051,
"learning_rate": 2.4800000000000003e-05,
"loss": 1.8324,
"step": 31
},
{
"epoch": 0.006645898234683281,
"grad_norm": 0.7843337655067444,
"learning_rate": 2.5600000000000002e-05,
"loss": 1.729,
"step": 32
},
{
"epoch": 0.006853582554517134,
"grad_norm": 0.7067154049873352,
"learning_rate": 2.6400000000000005e-05,
"loss": 1.7337,
"step": 33
},
{
"epoch": 0.007061266874350987,
"grad_norm": 0.7495686411857605,
"learning_rate": 2.7200000000000004e-05,
"loss": 1.8074,
"step": 34
},
{
"epoch": 0.007268951194184839,
"grad_norm": 0.7126153707504272,
"learning_rate": 2.8e-05,
"loss": 1.7377,
"step": 35
},
{
"epoch": 0.007476635514018692,
"grad_norm": 0.7612162828445435,
"learning_rate": 2.8800000000000002e-05,
"loss": 1.775,
"step": 36
},
{
"epoch": 0.0076843198338525445,
"grad_norm": 0.7538896799087524,
"learning_rate": 2.96e-05,
"loss": 1.7478,
"step": 37
},
{
"epoch": 0.007892004153686396,
"grad_norm": 0.8245300650596619,
"learning_rate": 3.0400000000000004e-05,
"loss": 1.769,
"step": 38
},
{
"epoch": 0.00809968847352025,
"grad_norm": 0.7681639790534973,
"learning_rate": 3.1200000000000006e-05,
"loss": 1.7789,
"step": 39
},
{
"epoch": 0.008307372793354102,
"grad_norm": 0.7439791560173035,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.8245,
"step": 40
},
{
"epoch": 0.008515057113187955,
"grad_norm": 0.7001054286956787,
"learning_rate": 3.28e-05,
"loss": 1.8516,
"step": 41
},
{
"epoch": 0.008722741433021807,
"grad_norm": 0.7122063040733337,
"learning_rate": 3.3600000000000004e-05,
"loss": 1.7698,
"step": 42
},
{
"epoch": 0.008930425752855659,
"grad_norm": 0.6748693585395813,
"learning_rate": 3.44e-05,
"loss": 1.7525,
"step": 43
},
{
"epoch": 0.009138110072689512,
"grad_norm": 0.7190465927124023,
"learning_rate": 3.52e-05,
"loss": 1.7725,
"step": 44
},
{
"epoch": 0.009345794392523364,
"grad_norm": 0.6779818534851074,
"learning_rate": 3.6e-05,
"loss": 1.7658,
"step": 45
},
{
"epoch": 0.009553478712357217,
"grad_norm": 0.6692732572555542,
"learning_rate": 3.680000000000001e-05,
"loss": 1.698,
"step": 46
},
{
"epoch": 0.009761163032191069,
"grad_norm": 0.6866817474365234,
"learning_rate": 3.76e-05,
"loss": 1.8105,
"step": 47
},
{
"epoch": 0.009968847352024923,
"grad_norm": 0.6470504999160767,
"learning_rate": 3.8400000000000005e-05,
"loss": 1.6455,
"step": 48
},
{
"epoch": 0.010176531671858774,
"grad_norm": 0.681182324886322,
"learning_rate": 3.9200000000000004e-05,
"loss": 1.6234,
"step": 49
},
{
"epoch": 0.010384215991692628,
"grad_norm": 0.6945406198501587,
"learning_rate": 4e-05,
"loss": 1.8115,
"step": 50
},
{
"epoch": 0.01059190031152648,
"grad_norm": 0.635296642780304,
"learning_rate": 4.08e-05,
"loss": 1.752,
"step": 51
},
{
"epoch": 0.010799584631360333,
"grad_norm": 0.6995029449462891,
"learning_rate": 4.16e-05,
"loss": 1.7869,
"step": 52
},
{
"epoch": 0.011007268951194185,
"grad_norm": 0.6692876815795898,
"learning_rate": 4.240000000000001e-05,
"loss": 1.7784,
"step": 53
},
{
"epoch": 0.011214953271028037,
"grad_norm": 0.6663792133331299,
"learning_rate": 4.3200000000000007e-05,
"loss": 1.7742,
"step": 54
},
{
"epoch": 0.01142263759086189,
"grad_norm": 0.6469627022743225,
"learning_rate": 4.4000000000000006e-05,
"loss": 1.7418,
"step": 55
},
{
"epoch": 0.011630321910695742,
"grad_norm": 0.6327781677246094,
"learning_rate": 4.4800000000000005e-05,
"loss": 1.7479,
"step": 56
},
{
"epoch": 0.011838006230529595,
"grad_norm": 0.6531626582145691,
"learning_rate": 4.56e-05,
"loss": 1.7121,
"step": 57
},
{
"epoch": 0.012045690550363447,
"grad_norm": 0.6819879412651062,
"learning_rate": 4.64e-05,
"loss": 1.8296,
"step": 58
},
{
"epoch": 0.0122533748701973,
"grad_norm": 0.6116477847099304,
"learning_rate": 4.72e-05,
"loss": 1.6922,
"step": 59
},
{
"epoch": 0.012461059190031152,
"grad_norm": 0.6569270491600037,
"learning_rate": 4.8e-05,
"loss": 1.7661,
"step": 60
},
{
"epoch": 0.012668743509865006,
"grad_norm": 0.6796997785568237,
"learning_rate": 4.88e-05,
"loss": 1.7959,
"step": 61
},
{
"epoch": 0.012876427829698858,
"grad_norm": 0.7139971256256104,
"learning_rate": 4.9600000000000006e-05,
"loss": 1.7764,
"step": 62
},
{
"epoch": 0.013084112149532711,
"grad_norm": 0.7374599575996399,
"learning_rate": 5.0400000000000005e-05,
"loss": 1.7271,
"step": 63
},
{
"epoch": 0.013291796469366563,
"grad_norm": 0.6610848307609558,
"learning_rate": 5.1200000000000004e-05,
"loss": 1.8526,
"step": 64
},
{
"epoch": 0.013499480789200415,
"grad_norm": 0.742107093334198,
"learning_rate": 5.2000000000000004e-05,
"loss": 1.754,
"step": 65
},
{
"epoch": 0.013707165109034268,
"grad_norm": 0.6820899844169617,
"learning_rate": 5.280000000000001e-05,
"loss": 1.7355,
"step": 66
},
{
"epoch": 0.01391484942886812,
"grad_norm": 0.649463951587677,
"learning_rate": 5.360000000000001e-05,
"loss": 1.6453,
"step": 67
},
{
"epoch": 0.014122533748701973,
"grad_norm": 0.7077696323394775,
"learning_rate": 5.440000000000001e-05,
"loss": 1.7962,
"step": 68
},
{
"epoch": 0.014330218068535825,
"grad_norm": 0.6177412867546082,
"learning_rate": 5.52e-05,
"loss": 1.6742,
"step": 69
},
{
"epoch": 0.014537902388369679,
"grad_norm": 0.7839769721031189,
"learning_rate": 5.6e-05,
"loss": 1.8364,
"step": 70
},
{
"epoch": 0.01474558670820353,
"grad_norm": 0.6374268531799316,
"learning_rate": 5.6800000000000005e-05,
"loss": 1.7423,
"step": 71
},
{
"epoch": 0.014953271028037384,
"grad_norm": 0.6933043599128723,
"learning_rate": 5.7600000000000004e-05,
"loss": 1.7669,
"step": 72
},
{
"epoch": 0.015160955347871236,
"grad_norm": 0.641764760017395,
"learning_rate": 5.84e-05,
"loss": 1.7328,
"step": 73
},
{
"epoch": 0.015368639667705089,
"grad_norm": 0.6187850832939148,
"learning_rate": 5.92e-05,
"loss": 1.6375,
"step": 74
},
{
"epoch": 0.01557632398753894,
"grad_norm": 0.6676532030105591,
"learning_rate": 6.000000000000001e-05,
"loss": 1.7784,
"step": 75
},
{
"epoch": 0.015784008307372793,
"grad_norm": 0.6391186118125916,
"learning_rate": 6.080000000000001e-05,
"loss": 1.6554,
"step": 76
},
{
"epoch": 0.015991692627206646,
"grad_norm": 0.6301543116569519,
"learning_rate": 6.16e-05,
"loss": 1.9057,
"step": 77
},
{
"epoch": 0.0161993769470405,
"grad_norm": 0.6136651039123535,
"learning_rate": 6.240000000000001e-05,
"loss": 1.6371,
"step": 78
},
{
"epoch": 0.01640706126687435,
"grad_norm": 0.6152656674385071,
"learning_rate": 6.32e-05,
"loss": 1.7339,
"step": 79
},
{
"epoch": 0.016614745586708203,
"grad_norm": 0.6085280179977417,
"learning_rate": 6.400000000000001e-05,
"loss": 1.7106,
"step": 80
},
{
"epoch": 0.016822429906542057,
"grad_norm": 0.6468527913093567,
"learning_rate": 6.48e-05,
"loss": 1.7993,
"step": 81
},
{
"epoch": 0.01703011422637591,
"grad_norm": 0.6175693273544312,
"learning_rate": 6.56e-05,
"loss": 1.7434,
"step": 82
},
{
"epoch": 0.01723779854620976,
"grad_norm": 0.5871699452400208,
"learning_rate": 6.64e-05,
"loss": 1.6838,
"step": 83
},
{
"epoch": 0.017445482866043614,
"grad_norm": 0.6958571672439575,
"learning_rate": 6.720000000000001e-05,
"loss": 1.7389,
"step": 84
},
{
"epoch": 0.017653167185877467,
"grad_norm": 0.5976506471633911,
"learning_rate": 6.8e-05,
"loss": 1.7073,
"step": 85
},
{
"epoch": 0.017860851505711317,
"grad_norm": 0.6843888163566589,
"learning_rate": 6.88e-05,
"loss": 1.7339,
"step": 86
},
{
"epoch": 0.01806853582554517,
"grad_norm": 0.5901737213134766,
"learning_rate": 6.960000000000001e-05,
"loss": 1.7577,
"step": 87
},
{
"epoch": 0.018276220145379024,
"grad_norm": 0.6460241675376892,
"learning_rate": 7.04e-05,
"loss": 1.7482,
"step": 88
},
{
"epoch": 0.018483904465212878,
"grad_norm": 0.6465390920639038,
"learning_rate": 7.120000000000001e-05,
"loss": 1.8085,
"step": 89
},
{
"epoch": 0.018691588785046728,
"grad_norm": 0.6187453866004944,
"learning_rate": 7.2e-05,
"loss": 1.7507,
"step": 90
},
{
"epoch": 0.01889927310488058,
"grad_norm": 0.6101999282836914,
"learning_rate": 7.280000000000001e-05,
"loss": 1.7488,
"step": 91
},
{
"epoch": 0.019106957424714435,
"grad_norm": 0.5893586874008179,
"learning_rate": 7.360000000000001e-05,
"loss": 1.6846,
"step": 92
},
{
"epoch": 0.019314641744548288,
"grad_norm": 0.6104580163955688,
"learning_rate": 7.44e-05,
"loss": 1.6573,
"step": 93
},
{
"epoch": 0.019522326064382138,
"grad_norm": 0.6451858878135681,
"learning_rate": 7.52e-05,
"loss": 1.8052,
"step": 94
},
{
"epoch": 0.01973001038421599,
"grad_norm": 0.5865317583084106,
"learning_rate": 7.6e-05,
"loss": 1.6534,
"step": 95
},
{
"epoch": 0.019937694704049845,
"grad_norm": 0.6278175711631775,
"learning_rate": 7.680000000000001e-05,
"loss": 1.6898,
"step": 96
},
{
"epoch": 0.020145379023883695,
"grad_norm": 0.5943008661270142,
"learning_rate": 7.76e-05,
"loss": 1.7049,
"step": 97
},
{
"epoch": 0.02035306334371755,
"grad_norm": 0.6195578575134277,
"learning_rate": 7.840000000000001e-05,
"loss": 1.7527,
"step": 98
},
{
"epoch": 0.020560747663551402,
"grad_norm": 0.6871982216835022,
"learning_rate": 7.92e-05,
"loss": 1.708,
"step": 99
},
{
"epoch": 0.020768431983385256,
"grad_norm": 0.6266918182373047,
"learning_rate": 8e-05,
"loss": 1.781,
"step": 100
},
{
"epoch": 0.020976116303219106,
"grad_norm": 0.7398074269294739,
"learning_rate": 8e-05,
"loss": 1.7166,
"step": 101
},
{
"epoch": 0.02118380062305296,
"grad_norm": 0.6563434600830078,
"learning_rate": 8e-05,
"loss": 1.8624,
"step": 102
},
{
"epoch": 0.021391484942886813,
"grad_norm": 0.7264277338981628,
"learning_rate": 8e-05,
"loss": 1.7253,
"step": 103
},
{
"epoch": 0.021599169262720666,
"grad_norm": 0.6289424896240234,
"learning_rate": 8e-05,
"loss": 1.7794,
"step": 104
},
{
"epoch": 0.021806853582554516,
"grad_norm": 0.6097893714904785,
"learning_rate": 8e-05,
"loss": 1.7288,
"step": 105
},
{
"epoch": 0.02201453790238837,
"grad_norm": 0.5839325785636902,
"learning_rate": 8e-05,
"loss": 1.6222,
"step": 106
},
{
"epoch": 0.022222222222222223,
"grad_norm": 0.5372262001037598,
"learning_rate": 8e-05,
"loss": 1.6274,
"step": 107
},
{
"epoch": 0.022429906542056073,
"grad_norm": 0.5895355343818665,
"learning_rate": 8e-05,
"loss": 1.7382,
"step": 108
},
{
"epoch": 0.022637590861889927,
"grad_norm": 0.6392791867256165,
"learning_rate": 8e-05,
"loss": 1.7517,
"step": 109
},
{
"epoch": 0.02284527518172378,
"grad_norm": 0.6434172987937927,
"learning_rate": 8e-05,
"loss": 1.7384,
"step": 110
},
{
"epoch": 0.023052959501557634,
"grad_norm": 0.5970573425292969,
"learning_rate": 8e-05,
"loss": 1.7531,
"step": 111
},
{
"epoch": 0.023260643821391484,
"grad_norm": 0.5773294568061829,
"learning_rate": 8e-05,
"loss": 1.6521,
"step": 112
},
{
"epoch": 0.023468328141225337,
"grad_norm": 0.6489471197128296,
"learning_rate": 8e-05,
"loss": 1.7838,
"step": 113
},
{
"epoch": 0.02367601246105919,
"grad_norm": 0.6172572374343872,
"learning_rate": 8e-05,
"loss": 1.7552,
"step": 114
},
{
"epoch": 0.023883696780893044,
"grad_norm": 0.5731709003448486,
"learning_rate": 8e-05,
"loss": 1.6842,
"step": 115
},
{
"epoch": 0.024091381100726894,
"grad_norm": 0.5704280138015747,
"learning_rate": 8e-05,
"loss": 1.7002,
"step": 116
},
{
"epoch": 0.024299065420560748,
"grad_norm": 0.5774348378181458,
"learning_rate": 8e-05,
"loss": 1.7487,
"step": 117
},
{
"epoch": 0.0245067497403946,
"grad_norm": 0.5606811046600342,
"learning_rate": 8e-05,
"loss": 1.6928,
"step": 118
},
{
"epoch": 0.02471443406022845,
"grad_norm": 0.5781872868537903,
"learning_rate": 8e-05,
"loss": 1.7106,
"step": 119
},
{
"epoch": 0.024922118380062305,
"grad_norm": 0.5436711311340332,
"learning_rate": 8e-05,
"loss": 1.6909,
"step": 120
},
{
"epoch": 0.025129802699896158,
"grad_norm": 0.5731891393661499,
"learning_rate": 8e-05,
"loss": 1.6783,
"step": 121
},
{
"epoch": 0.02533748701973001,
"grad_norm": 0.5782831907272339,
"learning_rate": 8e-05,
"loss": 1.715,
"step": 122
},
{
"epoch": 0.02554517133956386,
"grad_norm": 0.6293315291404724,
"learning_rate": 8e-05,
"loss": 1.7052,
"step": 123
},
{
"epoch": 0.025752855659397715,
"grad_norm": 0.5766347646713257,
"learning_rate": 8e-05,
"loss": 1.7466,
"step": 124
},
{
"epoch": 0.02596053997923157,
"grad_norm": 0.6577507257461548,
"learning_rate": 8e-05,
"loss": 1.7412,
"step": 125
},
{
"epoch": 0.026168224299065422,
"grad_norm": 0.5595094561576843,
"learning_rate": 8e-05,
"loss": 1.7464,
"step": 126
},
{
"epoch": 0.026375908618899272,
"grad_norm": 0.5962421894073486,
"learning_rate": 8e-05,
"loss": 1.6721,
"step": 127
},
{
"epoch": 0.026583592938733126,
"grad_norm": 0.5711417198181152,
"learning_rate": 8e-05,
"loss": 1.7608,
"step": 128
},
{
"epoch": 0.02679127725856698,
"grad_norm": 0.6157316565513611,
"learning_rate": 8e-05,
"loss": 1.7212,
"step": 129
},
{
"epoch": 0.02699896157840083,
"grad_norm": 0.5418056845664978,
"learning_rate": 8e-05,
"loss": 1.6652,
"step": 130
},
{
"epoch": 0.027206645898234683,
"grad_norm": 0.5753270983695984,
"learning_rate": 8e-05,
"loss": 1.7269,
"step": 131
},
{
"epoch": 0.027414330218068536,
"grad_norm": 0.5521407127380371,
"learning_rate": 8e-05,
"loss": 1.7053,
"step": 132
},
{
"epoch": 0.02762201453790239,
"grad_norm": 0.5795088410377502,
"learning_rate": 8e-05,
"loss": 1.8084,
"step": 133
},
{
"epoch": 0.02782969885773624,
"grad_norm": 0.5171025991439819,
"learning_rate": 8e-05,
"loss": 1.6639,
"step": 134
},
{
"epoch": 0.028037383177570093,
"grad_norm": 0.5902402997016907,
"learning_rate": 8e-05,
"loss": 1.6855,
"step": 135
},
{
"epoch": 0.028245067497403947,
"grad_norm": 0.535606861114502,
"learning_rate": 8e-05,
"loss": 1.7269,
"step": 136
},
{
"epoch": 0.0284527518172378,
"grad_norm": 0.61818528175354,
"learning_rate": 8e-05,
"loss": 1.7777,
"step": 137
},
{
"epoch": 0.02866043613707165,
"grad_norm": 0.5806450843811035,
"learning_rate": 8e-05,
"loss": 1.738,
"step": 138
},
{
"epoch": 0.028868120456905504,
"grad_norm": 0.5649462342262268,
"learning_rate": 8e-05,
"loss": 1.6814,
"step": 139
},
{
"epoch": 0.029075804776739357,
"grad_norm": 0.5286867022514343,
"learning_rate": 8e-05,
"loss": 1.6813,
"step": 140
},
{
"epoch": 0.029283489096573207,
"grad_norm": 0.5503925085067749,
"learning_rate": 8e-05,
"loss": 1.6656,
"step": 141
},
{
"epoch": 0.02949117341640706,
"grad_norm": 0.5422264933586121,
"learning_rate": 8e-05,
"loss": 1.6465,
"step": 142
},
{
"epoch": 0.029698857736240914,
"grad_norm": 0.5629477500915527,
"learning_rate": 8e-05,
"loss": 1.7515,
"step": 143
},
{
"epoch": 0.029906542056074768,
"grad_norm": 0.5476765036582947,
"learning_rate": 8e-05,
"loss": 1.7234,
"step": 144
},
{
"epoch": 0.030114226375908618,
"grad_norm": 0.5577101111412048,
"learning_rate": 8e-05,
"loss": 1.6463,
"step": 145
},
{
"epoch": 0.03032191069574247,
"grad_norm": 0.566056489944458,
"learning_rate": 8e-05,
"loss": 1.7592,
"step": 146
},
{
"epoch": 0.030529595015576325,
"grad_norm": 0.5325253009796143,
"learning_rate": 8e-05,
"loss": 1.7202,
"step": 147
},
{
"epoch": 0.030737279335410178,
"grad_norm": 0.569216787815094,
"learning_rate": 8e-05,
"loss": 1.7454,
"step": 148
},
{
"epoch": 0.030944963655244028,
"grad_norm": 0.5541402101516724,
"learning_rate": 8e-05,
"loss": 1.5909,
"step": 149
},
{
"epoch": 0.03115264797507788,
"grad_norm": 0.5762909650802612,
"learning_rate": 8e-05,
"loss": 1.7057,
"step": 150
},
{
"epoch": 0.03136033229491173,
"grad_norm": 0.5259131193161011,
"learning_rate": 8e-05,
"loss": 1.624,
"step": 151
},
{
"epoch": 0.031568016614745585,
"grad_norm": 0.5526155233383179,
"learning_rate": 8e-05,
"loss": 1.6645,
"step": 152
},
{
"epoch": 0.03177570093457944,
"grad_norm": 0.5420238375663757,
"learning_rate": 8e-05,
"loss": 1.6899,
"step": 153
},
{
"epoch": 0.03198338525441329,
"grad_norm": 0.5291162133216858,
"learning_rate": 8e-05,
"loss": 1.7062,
"step": 154
},
{
"epoch": 0.032191069574247146,
"grad_norm": 0.5605945587158203,
"learning_rate": 8e-05,
"loss": 1.7297,
"step": 155
},
{
"epoch": 0.032398753894081,
"grad_norm": 0.5243666768074036,
"learning_rate": 8e-05,
"loss": 1.6641,
"step": 156
},
{
"epoch": 0.03260643821391485,
"grad_norm": 0.5971183180809021,
"learning_rate": 8e-05,
"loss": 1.6927,
"step": 157
},
{
"epoch": 0.0328141225337487,
"grad_norm": 0.5431556105613708,
"learning_rate": 8e-05,
"loss": 1.6636,
"step": 158
},
{
"epoch": 0.03302180685358255,
"grad_norm": 0.5140684247016907,
"learning_rate": 8e-05,
"loss": 1.6315,
"step": 159
},
{
"epoch": 0.033229491173416406,
"grad_norm": 0.518284022808075,
"learning_rate": 8e-05,
"loss": 1.6613,
"step": 160
},
{
"epoch": 0.03343717549325026,
"grad_norm": 0.5419729948043823,
"learning_rate": 8e-05,
"loss": 1.7693,
"step": 161
},
{
"epoch": 0.03364485981308411,
"grad_norm": 0.5171647667884827,
"learning_rate": 8e-05,
"loss": 1.6841,
"step": 162
},
{
"epoch": 0.03385254413291797,
"grad_norm": 0.5692399740219116,
"learning_rate": 8e-05,
"loss": 1.7286,
"step": 163
},
{
"epoch": 0.03406022845275182,
"grad_norm": 0.5294033288955688,
"learning_rate": 8e-05,
"loss": 1.7233,
"step": 164
},
{
"epoch": 0.03426791277258567,
"grad_norm": 0.5355773568153381,
"learning_rate": 8e-05,
"loss": 1.748,
"step": 165
},
{
"epoch": 0.03447559709241952,
"grad_norm": 0.5389161705970764,
"learning_rate": 8e-05,
"loss": 1.7231,
"step": 166
},
{
"epoch": 0.034683281412253374,
"grad_norm": 0.5242868065834045,
"learning_rate": 8e-05,
"loss": 1.759,
"step": 167
},
{
"epoch": 0.03489096573208723,
"grad_norm": 0.5554731488227844,
"learning_rate": 8e-05,
"loss": 1.7271,
"step": 168
},
{
"epoch": 0.03509865005192108,
"grad_norm": 0.603803277015686,
"learning_rate": 8e-05,
"loss": 1.7537,
"step": 169
},
{
"epoch": 0.035306334371754934,
"grad_norm": 0.5199013948440552,
"learning_rate": 8e-05,
"loss": 1.6379,
"step": 170
},
{
"epoch": 0.03551401869158879,
"grad_norm": 0.5927119851112366,
"learning_rate": 8e-05,
"loss": 1.6706,
"step": 171
},
{
"epoch": 0.035721703011422634,
"grad_norm": 0.5246160626411438,
"learning_rate": 8e-05,
"loss": 1.6838,
"step": 172
},
{
"epoch": 0.03592938733125649,
"grad_norm": 0.568761944770813,
"learning_rate": 8e-05,
"loss": 1.6608,
"step": 173
},
{
"epoch": 0.03613707165109034,
"grad_norm": 0.5335773825645447,
"learning_rate": 8e-05,
"loss": 1.6861,
"step": 174
},
{
"epoch": 0.036344755970924195,
"grad_norm": 0.5661566257476807,
"learning_rate": 8e-05,
"loss": 1.7069,
"step": 175
},
{
"epoch": 0.03655244029075805,
"grad_norm": 0.550425112247467,
"learning_rate": 8e-05,
"loss": 1.7824,
"step": 176
},
{
"epoch": 0.0367601246105919,
"grad_norm": 0.5691404938697815,
"learning_rate": 8e-05,
"loss": 1.7234,
"step": 177
},
{
"epoch": 0.036967808930425755,
"grad_norm": 0.5029447078704834,
"learning_rate": 8e-05,
"loss": 1.592,
"step": 178
},
{
"epoch": 0.03717549325025961,
"grad_norm": 0.5434035062789917,
"learning_rate": 8e-05,
"loss": 1.76,
"step": 179
},
{
"epoch": 0.037383177570093455,
"grad_norm": 0.5171444416046143,
"learning_rate": 8e-05,
"loss": 1.6538,
"step": 180
},
{
"epoch": 0.03759086188992731,
"grad_norm": 0.5830038785934448,
"learning_rate": 8e-05,
"loss": 1.8091,
"step": 181
},
{
"epoch": 0.03779854620976116,
"grad_norm": 0.5217128992080688,
"learning_rate": 8e-05,
"loss": 1.7283,
"step": 182
},
{
"epoch": 0.038006230529595016,
"grad_norm": 0.6341305375099182,
"learning_rate": 8e-05,
"loss": 1.7947,
"step": 183
},
{
"epoch": 0.03821391484942887,
"grad_norm": 0.5161294341087341,
"learning_rate": 8e-05,
"loss": 1.6634,
"step": 184
},
{
"epoch": 0.03842159916926272,
"grad_norm": 0.6006560325622559,
"learning_rate": 8e-05,
"loss": 1.6709,
"step": 185
},
{
"epoch": 0.038629283489096576,
"grad_norm": 0.5234457850456238,
"learning_rate": 8e-05,
"loss": 1.695,
"step": 186
},
{
"epoch": 0.03883696780893042,
"grad_norm": 0.5401941537857056,
"learning_rate": 8e-05,
"loss": 1.6625,
"step": 187
},
{
"epoch": 0.039044652128764276,
"grad_norm": 0.5211032032966614,
"learning_rate": 8e-05,
"loss": 1.6295,
"step": 188
},
{
"epoch": 0.03925233644859813,
"grad_norm": 0.5172737240791321,
"learning_rate": 8e-05,
"loss": 1.5994,
"step": 189
},
{
"epoch": 0.03946002076843198,
"grad_norm": 0.5083449482917786,
"learning_rate": 8e-05,
"loss": 1.6803,
"step": 190
},
{
"epoch": 0.03966770508826584,
"grad_norm": 0.5440491437911987,
"learning_rate": 8e-05,
"loss": 1.7009,
"step": 191
},
{
"epoch": 0.03987538940809969,
"grad_norm": 0.6146543622016907,
"learning_rate": 8e-05,
"loss": 1.7213,
"step": 192
},
{
"epoch": 0.040083073727933544,
"grad_norm": 0.5229283571243286,
"learning_rate": 8e-05,
"loss": 1.6891,
"step": 193
},
{
"epoch": 0.04029075804776739,
"grad_norm": 0.6130139827728271,
"learning_rate": 8e-05,
"loss": 1.7485,
"step": 194
},
{
"epoch": 0.040498442367601244,
"grad_norm": 0.5257254242897034,
"learning_rate": 8e-05,
"loss": 1.6668,
"step": 195
},
{
"epoch": 0.0407061266874351,
"grad_norm": 0.5981298089027405,
"learning_rate": 8e-05,
"loss": 1.7177,
"step": 196
},
{
"epoch": 0.04091381100726895,
"grad_norm": 0.48841404914855957,
"learning_rate": 8e-05,
"loss": 1.6452,
"step": 197
},
{
"epoch": 0.041121495327102804,
"grad_norm": 0.6227656006813049,
"learning_rate": 8e-05,
"loss": 1.6978,
"step": 198
},
{
"epoch": 0.04132917964693666,
"grad_norm": 0.5224297046661377,
"learning_rate": 8e-05,
"loss": 1.7488,
"step": 199
},
{
"epoch": 0.04153686396677051,
"grad_norm": 0.6549935936927795,
"learning_rate": 8e-05,
"loss": 1.759,
"step": 200
},
{
"epoch": 0.041744548286604365,
"grad_norm": 0.5142560601234436,
"learning_rate": 8e-05,
"loss": 1.6678,
"step": 201
},
{
"epoch": 0.04195223260643821,
"grad_norm": 0.53005051612854,
"learning_rate": 8e-05,
"loss": 1.6766,
"step": 202
},
{
"epoch": 0.042159916926272065,
"grad_norm": 0.5193365812301636,
"learning_rate": 8e-05,
"loss": 1.6739,
"step": 203
},
{
"epoch": 0.04236760124610592,
"grad_norm": 0.5400128364562988,
"learning_rate": 8e-05,
"loss": 1.7213,
"step": 204
},
{
"epoch": 0.04257528556593977,
"grad_norm": 0.5410746932029724,
"learning_rate": 8e-05,
"loss": 1.721,
"step": 205
},
{
"epoch": 0.042782969885773625,
"grad_norm": 0.4970126450061798,
"learning_rate": 8e-05,
"loss": 1.6326,
"step": 206
},
{
"epoch": 0.04299065420560748,
"grad_norm": 0.5962429642677307,
"learning_rate": 8e-05,
"loss": 1.7096,
"step": 207
},
{
"epoch": 0.04319833852544133,
"grad_norm": 0.5132887363433838,
"learning_rate": 8e-05,
"loss": 1.7047,
"step": 208
},
{
"epoch": 0.04340602284527518,
"grad_norm": 0.5727942585945129,
"learning_rate": 8e-05,
"loss": 1.6485,
"step": 209
},
{
"epoch": 0.04361370716510903,
"grad_norm": 0.516907811164856,
"learning_rate": 8e-05,
"loss": 1.693,
"step": 210
},
{
"epoch": 0.043821391484942886,
"grad_norm": 0.5103829503059387,
"learning_rate": 8e-05,
"loss": 1.6644,
"step": 211
},
{
"epoch": 0.04402907580477674,
"grad_norm": 0.5416673421859741,
"learning_rate": 8e-05,
"loss": 1.7322,
"step": 212
},
{
"epoch": 0.04423676012461059,
"grad_norm": 0.4991842806339264,
"learning_rate": 8e-05,
"loss": 1.6948,
"step": 213
},
{
"epoch": 0.044444444444444446,
"grad_norm": 0.5187154412269592,
"learning_rate": 8e-05,
"loss": 1.5997,
"step": 214
},
{
"epoch": 0.0446521287642783,
"grad_norm": 0.5057274699211121,
"learning_rate": 8e-05,
"loss": 1.7124,
"step": 215
},
{
"epoch": 0.044859813084112146,
"grad_norm": 0.522167980670929,
"learning_rate": 8e-05,
"loss": 1.646,
"step": 216
},
{
"epoch": 0.045067497403946,
"grad_norm": 0.5210246443748474,
"learning_rate": 8e-05,
"loss": 1.7006,
"step": 217
},
{
"epoch": 0.04527518172377985,
"grad_norm": 0.5107594728469849,
"learning_rate": 8e-05,
"loss": 1.6886,
"step": 218
},
{
"epoch": 0.04548286604361371,
"grad_norm": 0.546195387840271,
"learning_rate": 8e-05,
"loss": 1.6922,
"step": 219
},
{
"epoch": 0.04569055036344756,
"grad_norm": 0.5224754214286804,
"learning_rate": 8e-05,
"loss": 1.7163,
"step": 220
},
{
"epoch": 0.045898234683281414,
"grad_norm": 0.538343071937561,
"learning_rate": 8e-05,
"loss": 1.6758,
"step": 221
},
{
"epoch": 0.04610591900311527,
"grad_norm": 0.5105881094932556,
"learning_rate": 8e-05,
"loss": 1.7446,
"step": 222
},
{
"epoch": 0.04631360332294912,
"grad_norm": 0.5250230431556702,
"learning_rate": 8e-05,
"loss": 1.7269,
"step": 223
},
{
"epoch": 0.04652128764278297,
"grad_norm": 0.5123667120933533,
"learning_rate": 8e-05,
"loss": 1.6593,
"step": 224
},
{
"epoch": 0.04672897196261682,
"grad_norm": 0.5394571423530579,
"learning_rate": 8e-05,
"loss": 1.6677,
"step": 225
},
{
"epoch": 0.046936656282450674,
"grad_norm": 0.5013480186462402,
"learning_rate": 8e-05,
"loss": 1.6936,
"step": 226
},
{
"epoch": 0.04714434060228453,
"grad_norm": 0.501785933971405,
"learning_rate": 8e-05,
"loss": 1.5588,
"step": 227
},
{
"epoch": 0.04735202492211838,
"grad_norm": 0.5234807729721069,
"learning_rate": 8e-05,
"loss": 1.7372,
"step": 228
},
{
"epoch": 0.047559709241952235,
"grad_norm": 0.5221506357192993,
"learning_rate": 8e-05,
"loss": 1.6345,
"step": 229
},
{
"epoch": 0.04776739356178609,
"grad_norm": 0.5281847715377808,
"learning_rate": 8e-05,
"loss": 1.6074,
"step": 230
},
{
"epoch": 0.047975077881619935,
"grad_norm": 0.5333276987075806,
"learning_rate": 8e-05,
"loss": 1.6483,
"step": 231
},
{
"epoch": 0.04818276220145379,
"grad_norm": 0.49470800161361694,
"learning_rate": 8e-05,
"loss": 1.6387,
"step": 232
},
{
"epoch": 0.04839044652128764,
"grad_norm": 0.5513780117034912,
"learning_rate": 8e-05,
"loss": 1.6956,
"step": 233
},
{
"epoch": 0.048598130841121495,
"grad_norm": 0.48948371410369873,
"learning_rate": 8e-05,
"loss": 1.6548,
"step": 234
},
{
"epoch": 0.04880581516095535,
"grad_norm": 0.5753876566886902,
"learning_rate": 8e-05,
"loss": 1.7143,
"step": 235
},
{
"epoch": 0.0490134994807892,
"grad_norm": 0.5774050951004028,
"learning_rate": 8e-05,
"loss": 1.6995,
"step": 236
},
{
"epoch": 0.049221183800623056,
"grad_norm": 0.5836580395698547,
"learning_rate": 8e-05,
"loss": 1.75,
"step": 237
},
{
"epoch": 0.0494288681204569,
"grad_norm": 0.6421523094177246,
"learning_rate": 8e-05,
"loss": 1.7579,
"step": 238
},
{
"epoch": 0.049636552440290756,
"grad_norm": 0.5229347944259644,
"learning_rate": 8e-05,
"loss": 1.6735,
"step": 239
},
{
"epoch": 0.04984423676012461,
"grad_norm": 0.6658037304878235,
"learning_rate": 8e-05,
"loss": 1.7808,
"step": 240
},
{
"epoch": 0.05005192107995846,
"grad_norm": 0.5098143219947815,
"learning_rate": 8e-05,
"loss": 1.6961,
"step": 241
},
{
"epoch": 0.050259605399792316,
"grad_norm": 0.5492331385612488,
"learning_rate": 8e-05,
"loss": 1.6589,
"step": 242
},
{
"epoch": 0.05046728971962617,
"grad_norm": 0.5122767686843872,
"learning_rate": 8e-05,
"loss": 1.688,
"step": 243
},
{
"epoch": 0.05067497403946002,
"grad_norm": 0.5612162351608276,
"learning_rate": 8e-05,
"loss": 1.7156,
"step": 244
},
{
"epoch": 0.05088265835929388,
"grad_norm": 0.5456447601318359,
"learning_rate": 8e-05,
"loss": 1.668,
"step": 245
},
{
"epoch": 0.05109034267912772,
"grad_norm": 0.5987516641616821,
"learning_rate": 8e-05,
"loss": 1.7434,
"step": 246
},
{
"epoch": 0.05129802699896158,
"grad_norm": 0.5334711074829102,
"learning_rate": 8e-05,
"loss": 1.6684,
"step": 247
},
{
"epoch": 0.05150571131879543,
"grad_norm": 0.5441988706588745,
"learning_rate": 8e-05,
"loss": 1.667,
"step": 248
},
{
"epoch": 0.051713395638629284,
"grad_norm": 0.5460416078567505,
"learning_rate": 8e-05,
"loss": 1.6615,
"step": 249
},
{
"epoch": 0.05192107995846314,
"grad_norm": 0.5425359010696411,
"learning_rate": 8e-05,
"loss": 1.6637,
"step": 250
},
{
"epoch": 0.05212876427829699,
"grad_norm": 0.5430195927619934,
"learning_rate": 8e-05,
"loss": 1.6521,
"step": 251
},
{
"epoch": 0.052336448598130844,
"grad_norm": 0.4934813678264618,
"learning_rate": 8e-05,
"loss": 1.7507,
"step": 252
},
{
"epoch": 0.05254413291796469,
"grad_norm": 0.5606212019920349,
"learning_rate": 8e-05,
"loss": 1.754,
"step": 253
},
{
"epoch": 0.052751817237798544,
"grad_norm": 0.5214331746101379,
"learning_rate": 8e-05,
"loss": 1.676,
"step": 254
},
{
"epoch": 0.0529595015576324,
"grad_norm": 0.60882568359375,
"learning_rate": 8e-05,
"loss": 1.7517,
"step": 255
},
{
"epoch": 0.05316718587746625,
"grad_norm": 0.5168995261192322,
"learning_rate": 8e-05,
"loss": 1.6832,
"step": 256
},
{
"epoch": 0.053374870197300105,
"grad_norm": 0.5288822650909424,
"learning_rate": 8e-05,
"loss": 1.6817,
"step": 257
},
{
"epoch": 0.05358255451713396,
"grad_norm": 0.563677966594696,
"learning_rate": 8e-05,
"loss": 1.7,
"step": 258
},
{
"epoch": 0.05379023883696781,
"grad_norm": 0.5071162581443787,
"learning_rate": 8e-05,
"loss": 1.72,
"step": 259
},
{
"epoch": 0.05399792315680166,
"grad_norm": 0.5226413607597351,
"learning_rate": 8e-05,
"loss": 1.6503,
"step": 260
},
{
"epoch": 0.05420560747663551,
"grad_norm": 0.5036090016365051,
"learning_rate": 8e-05,
"loss": 1.671,
"step": 261
},
{
"epoch": 0.054413291796469365,
"grad_norm": 0.5291279554367065,
"learning_rate": 8e-05,
"loss": 1.6547,
"step": 262
},
{
"epoch": 0.05462097611630322,
"grad_norm": 0.5152414441108704,
"learning_rate": 8e-05,
"loss": 1.7235,
"step": 263
},
{
"epoch": 0.05482866043613707,
"grad_norm": 0.5221415758132935,
"learning_rate": 8e-05,
"loss": 1.7344,
"step": 264
},
{
"epoch": 0.055036344755970926,
"grad_norm": 0.5111775398254395,
"learning_rate": 8e-05,
"loss": 1.6625,
"step": 265
},
{
"epoch": 0.05524402907580478,
"grad_norm": 0.49026650190353394,
"learning_rate": 8e-05,
"loss": 1.626,
"step": 266
},
{
"epoch": 0.05545171339563863,
"grad_norm": 0.5113672614097595,
"learning_rate": 8e-05,
"loss": 1.7152,
"step": 267
},
{
"epoch": 0.05565939771547248,
"grad_norm": 0.5143501162528992,
"learning_rate": 8e-05,
"loss": 1.6065,
"step": 268
},
{
"epoch": 0.05586708203530633,
"grad_norm": 0.548305094242096,
"learning_rate": 8e-05,
"loss": 1.7169,
"step": 269
},
{
"epoch": 0.056074766355140186,
"grad_norm": 0.5203267931938171,
"learning_rate": 8e-05,
"loss": 1.7894,
"step": 270
},
{
"epoch": 0.05628245067497404,
"grad_norm": 0.5775648951530457,
"learning_rate": 8e-05,
"loss": 1.6821,
"step": 271
},
{
"epoch": 0.05649013499480789,
"grad_norm": 0.48842960596084595,
"learning_rate": 8e-05,
"loss": 1.6095,
"step": 272
},
{
"epoch": 0.05669781931464175,
"grad_norm": 0.5582982301712036,
"learning_rate": 8e-05,
"loss": 1.7071,
"step": 273
},
{
"epoch": 0.0569055036344756,
"grad_norm": 0.4931994378566742,
"learning_rate": 8e-05,
"loss": 1.6754,
"step": 274
},
{
"epoch": 0.05711318795430945,
"grad_norm": 0.5403690934181213,
"learning_rate": 8e-05,
"loss": 1.6986,
"step": 275
},
{
"epoch": 0.0573208722741433,
"grad_norm": 0.5050585269927979,
"learning_rate": 8e-05,
"loss": 1.7073,
"step": 276
},
{
"epoch": 0.057528556593977154,
"grad_norm": 0.5011170506477356,
"learning_rate": 8e-05,
"loss": 1.6561,
"step": 277
},
{
"epoch": 0.05773624091381101,
"grad_norm": 0.5033249258995056,
"learning_rate": 8e-05,
"loss": 1.7003,
"step": 278
},
{
"epoch": 0.05794392523364486,
"grad_norm": 0.5251483917236328,
"learning_rate": 8e-05,
"loss": 1.6675,
"step": 279
},
{
"epoch": 0.058151609553478714,
"grad_norm": 0.5164880752563477,
"learning_rate": 8e-05,
"loss": 1.7132,
"step": 280
},
{
"epoch": 0.05835929387331257,
"grad_norm": 0.4889441728591919,
"learning_rate": 8e-05,
"loss": 1.7148,
"step": 281
},
{
"epoch": 0.058566978193146414,
"grad_norm": 0.49667251110076904,
"learning_rate": 8e-05,
"loss": 1.7136,
"step": 282
},
{
"epoch": 0.05877466251298027,
"grad_norm": 0.5121577382087708,
"learning_rate": 8e-05,
"loss": 1.764,
"step": 283
},
{
"epoch": 0.05898234683281412,
"grad_norm": 0.48562902212142944,
"learning_rate": 8e-05,
"loss": 1.7124,
"step": 284
},
{
"epoch": 0.059190031152647975,
"grad_norm": 0.4913182556629181,
"learning_rate": 8e-05,
"loss": 1.6498,
"step": 285
},
{
"epoch": 0.05939771547248183,
"grad_norm": 0.5064699053764343,
"learning_rate": 8e-05,
"loss": 1.598,
"step": 286
},
{
"epoch": 0.05960539979231568,
"grad_norm": 0.553032398223877,
"learning_rate": 8e-05,
"loss": 1.705,
"step": 287
},
{
"epoch": 0.059813084112149535,
"grad_norm": 0.5344775319099426,
"learning_rate": 8e-05,
"loss": 1.7229,
"step": 288
},
{
"epoch": 0.06002076843198339,
"grad_norm": 0.5126497149467468,
"learning_rate": 8e-05,
"loss": 1.7099,
"step": 289
},
{
"epoch": 0.060228452751817235,
"grad_norm": 0.4790019690990448,
"learning_rate": 8e-05,
"loss": 1.6468,
"step": 290
},
{
"epoch": 0.06043613707165109,
"grad_norm": 0.5178213715553284,
"learning_rate": 8e-05,
"loss": 1.729,
"step": 291
},
{
"epoch": 0.06064382139148494,
"grad_norm": 0.4989386796951294,
"learning_rate": 8e-05,
"loss": 1.7297,
"step": 292
},
{
"epoch": 0.060851505711318796,
"grad_norm": 0.49952176213264465,
"learning_rate": 8e-05,
"loss": 1.7531,
"step": 293
},
{
"epoch": 0.06105919003115265,
"grad_norm": 0.5082775354385376,
"learning_rate": 8e-05,
"loss": 1.7071,
"step": 294
},
{
"epoch": 0.0612668743509865,
"grad_norm": 0.4975671172142029,
"learning_rate": 8e-05,
"loss": 1.6984,
"step": 295
},
{
"epoch": 0.061474558670820356,
"grad_norm": 0.5003031492233276,
"learning_rate": 8e-05,
"loss": 1.657,
"step": 296
},
{
"epoch": 0.0616822429906542,
"grad_norm": 0.524743378162384,
"learning_rate": 8e-05,
"loss": 1.6763,
"step": 297
},
{
"epoch": 0.061889927310488056,
"grad_norm": 0.5136753916740417,
"learning_rate": 8e-05,
"loss": 1.6642,
"step": 298
},
{
"epoch": 0.06209761163032191,
"grad_norm": 0.55498868227005,
"learning_rate": 8e-05,
"loss": 1.7189,
"step": 299
},
{
"epoch": 0.06230529595015576,
"grad_norm": 0.468551367521286,
"learning_rate": 8e-05,
"loss": 1.6853,
"step": 300
},
{
"epoch": 0.06251298026998961,
"grad_norm": 0.5169484615325928,
"learning_rate": 8e-05,
"loss": 1.6934,
"step": 301
},
{
"epoch": 0.06272066458982346,
"grad_norm": 0.4962407052516937,
"learning_rate": 8e-05,
"loss": 1.67,
"step": 302
},
{
"epoch": 0.06292834890965732,
"grad_norm": 0.5252827405929565,
"learning_rate": 8e-05,
"loss": 1.7051,
"step": 303
},
{
"epoch": 0.06313603322949117,
"grad_norm": 0.49051743745803833,
"learning_rate": 8e-05,
"loss": 1.6435,
"step": 304
},
{
"epoch": 0.06334371754932502,
"grad_norm": 0.49855688214302063,
"learning_rate": 8e-05,
"loss": 1.6876,
"step": 305
},
{
"epoch": 0.06355140186915888,
"grad_norm": 0.5097089409828186,
"learning_rate": 8e-05,
"loss": 1.7275,
"step": 306
},
{
"epoch": 0.06375908618899273,
"grad_norm": 0.4844972491264343,
"learning_rate": 8e-05,
"loss": 1.596,
"step": 307
},
{
"epoch": 0.06396677050882658,
"grad_norm": 0.4773125946521759,
"learning_rate": 8e-05,
"loss": 1.6302,
"step": 308
},
{
"epoch": 0.06417445482866044,
"grad_norm": 0.4860425293445587,
"learning_rate": 8e-05,
"loss": 1.6042,
"step": 309
},
{
"epoch": 0.06438213914849429,
"grad_norm": 0.49147120118141174,
"learning_rate": 8e-05,
"loss": 1.6496,
"step": 310
},
{
"epoch": 0.06458982346832814,
"grad_norm": 0.4904959797859192,
"learning_rate": 8e-05,
"loss": 1.6267,
"step": 311
},
{
"epoch": 0.064797507788162,
"grad_norm": 0.5017173886299133,
"learning_rate": 8e-05,
"loss": 1.6827,
"step": 312
},
{
"epoch": 0.06500519210799585,
"grad_norm": 0.5119773149490356,
"learning_rate": 8e-05,
"loss": 1.7423,
"step": 313
},
{
"epoch": 0.0652128764278297,
"grad_norm": 0.5187619924545288,
"learning_rate": 8e-05,
"loss": 1.6919,
"step": 314
},
{
"epoch": 0.06542056074766354,
"grad_norm": 0.499828577041626,
"learning_rate": 8e-05,
"loss": 1.6802,
"step": 315
},
{
"epoch": 0.0656282450674974,
"grad_norm": 0.5605705976486206,
"learning_rate": 8e-05,
"loss": 1.7527,
"step": 316
},
{
"epoch": 0.06583592938733125,
"grad_norm": 0.477637380361557,
"learning_rate": 8e-05,
"loss": 1.6605,
"step": 317
},
{
"epoch": 0.0660436137071651,
"grad_norm": 0.5356731414794922,
"learning_rate": 8e-05,
"loss": 1.6564,
"step": 318
},
{
"epoch": 0.06625129802699896,
"grad_norm": 0.4828336834907532,
"learning_rate": 8e-05,
"loss": 1.5889,
"step": 319
},
{
"epoch": 0.06645898234683281,
"grad_norm": 0.5663031339645386,
"learning_rate": 8e-05,
"loss": 1.7158,
"step": 320
},
{
"epoch": 0.06666666666666667,
"grad_norm": 0.5123764872550964,
"learning_rate": 8e-05,
"loss": 1.6212,
"step": 321
},
{
"epoch": 0.06687435098650052,
"grad_norm": 0.5193995833396912,
"learning_rate": 8e-05,
"loss": 1.6672,
"step": 322
},
{
"epoch": 0.06708203530633437,
"grad_norm": 0.612371563911438,
"learning_rate": 8e-05,
"loss": 1.7297,
"step": 323
},
{
"epoch": 0.06728971962616823,
"grad_norm": 0.5306211709976196,
"learning_rate": 8e-05,
"loss": 1.5693,
"step": 324
},
{
"epoch": 0.06749740394600208,
"grad_norm": 0.5609772801399231,
"learning_rate": 8e-05,
"loss": 1.6011,
"step": 325
},
{
"epoch": 0.06770508826583593,
"grad_norm": 0.5140113234519958,
"learning_rate": 8e-05,
"loss": 1.6519,
"step": 326
},
{
"epoch": 0.06791277258566979,
"grad_norm": 0.5529564023017883,
"learning_rate": 8e-05,
"loss": 1.6217,
"step": 327
},
{
"epoch": 0.06812045690550364,
"grad_norm": 0.48979297280311584,
"learning_rate": 8e-05,
"loss": 1.6044,
"step": 328
},
{
"epoch": 0.0683281412253375,
"grad_norm": 0.5272225141525269,
"learning_rate": 8e-05,
"loss": 1.7409,
"step": 329
},
{
"epoch": 0.06853582554517133,
"grad_norm": 0.5034517645835876,
"learning_rate": 8e-05,
"loss": 1.6181,
"step": 330
},
{
"epoch": 0.06874350986500519,
"grad_norm": 0.4756111800670624,
"learning_rate": 8e-05,
"loss": 1.6358,
"step": 331
},
{
"epoch": 0.06895119418483904,
"grad_norm": 0.5277131199836731,
"learning_rate": 8e-05,
"loss": 1.7289,
"step": 332
},
{
"epoch": 0.0691588785046729,
"grad_norm": 0.4864356815814972,
"learning_rate": 8e-05,
"loss": 1.6573,
"step": 333
},
{
"epoch": 0.06936656282450675,
"grad_norm": 0.49576079845428467,
"learning_rate": 8e-05,
"loss": 1.7073,
"step": 334
},
{
"epoch": 0.0695742471443406,
"grad_norm": 0.4868834614753723,
"learning_rate": 8e-05,
"loss": 1.6662,
"step": 335
},
{
"epoch": 0.06978193146417445,
"grad_norm": 0.5043144822120667,
"learning_rate": 8e-05,
"loss": 1.6672,
"step": 336
},
{
"epoch": 0.06998961578400831,
"grad_norm": 0.49576571583747864,
"learning_rate": 8e-05,
"loss": 1.7137,
"step": 337
},
{
"epoch": 0.07019730010384216,
"grad_norm": 0.4854160249233246,
"learning_rate": 8e-05,
"loss": 1.6409,
"step": 338
},
{
"epoch": 0.07040498442367601,
"grad_norm": 0.48695647716522217,
"learning_rate": 8e-05,
"loss": 1.6114,
"step": 339
},
{
"epoch": 0.07061266874350987,
"grad_norm": 0.4941887855529785,
"learning_rate": 8e-05,
"loss": 1.6843,
"step": 340
},
{
"epoch": 0.07082035306334372,
"grad_norm": 0.49328118562698364,
"learning_rate": 8e-05,
"loss": 1.6837,
"step": 341
},
{
"epoch": 0.07102803738317758,
"grad_norm": 0.4951527714729309,
"learning_rate": 8e-05,
"loss": 1.6745,
"step": 342
},
{
"epoch": 0.07123572170301143,
"grad_norm": 0.49303266406059265,
"learning_rate": 8e-05,
"loss": 1.6726,
"step": 343
},
{
"epoch": 0.07144340602284527,
"grad_norm": 0.48809322714805603,
"learning_rate": 8e-05,
"loss": 1.6716,
"step": 344
},
{
"epoch": 0.07165109034267912,
"grad_norm": 0.5064594745635986,
"learning_rate": 8e-05,
"loss": 1.7361,
"step": 345
},
{
"epoch": 0.07185877466251298,
"grad_norm": 0.5080369710922241,
"learning_rate": 8e-05,
"loss": 1.6929,
"step": 346
},
{
"epoch": 0.07206645898234683,
"grad_norm": 0.4928899109363556,
"learning_rate": 8e-05,
"loss": 1.7132,
"step": 347
},
{
"epoch": 0.07227414330218068,
"grad_norm": 0.47712448239326477,
"learning_rate": 8e-05,
"loss": 1.6334,
"step": 348
},
{
"epoch": 0.07248182762201454,
"grad_norm": 0.513994038105011,
"learning_rate": 8e-05,
"loss": 1.699,
"step": 349
},
{
"epoch": 0.07268951194184839,
"grad_norm": 0.5016348958015442,
"learning_rate": 8e-05,
"loss": 1.634,
"step": 350
},
{
"epoch": 0.07289719626168224,
"grad_norm": 0.5038484334945679,
"learning_rate": 8e-05,
"loss": 1.6643,
"step": 351
},
{
"epoch": 0.0731048805815161,
"grad_norm": 0.4928259551525116,
"learning_rate": 8e-05,
"loss": 1.6646,
"step": 352
},
{
"epoch": 0.07331256490134995,
"grad_norm": 0.5103161334991455,
"learning_rate": 8e-05,
"loss": 1.6494,
"step": 353
},
{
"epoch": 0.0735202492211838,
"grad_norm": 0.47103309631347656,
"learning_rate": 8e-05,
"loss": 1.6299,
"step": 354
},
{
"epoch": 0.07372793354101766,
"grad_norm": 0.5037011504173279,
"learning_rate": 8e-05,
"loss": 1.6283,
"step": 355
},
{
"epoch": 0.07393561786085151,
"grad_norm": 0.5026373267173767,
"learning_rate": 8e-05,
"loss": 1.7157,
"step": 356
},
{
"epoch": 0.07414330218068536,
"grad_norm": 0.5053650140762329,
"learning_rate": 8e-05,
"loss": 1.6587,
"step": 357
},
{
"epoch": 0.07435098650051922,
"grad_norm": 0.496429443359375,
"learning_rate": 8e-05,
"loss": 1.7041,
"step": 358
},
{
"epoch": 0.07455867082035306,
"grad_norm": 0.47459688782691956,
"learning_rate": 8e-05,
"loss": 1.6136,
"step": 359
},
{
"epoch": 0.07476635514018691,
"grad_norm": 0.5140553116798401,
"learning_rate": 8e-05,
"loss": 1.6625,
"step": 360
},
{
"epoch": 0.07497403946002076,
"grad_norm": 0.515044093132019,
"learning_rate": 8e-05,
"loss": 1.711,
"step": 361
},
{
"epoch": 0.07518172377985462,
"grad_norm": 0.5119950771331787,
"learning_rate": 8e-05,
"loss": 1.6852,
"step": 362
},
{
"epoch": 0.07538940809968847,
"grad_norm": 0.5062432289123535,
"learning_rate": 8e-05,
"loss": 1.6892,
"step": 363
},
{
"epoch": 0.07559709241952232,
"grad_norm": 0.5517569780349731,
"learning_rate": 8e-05,
"loss": 1.6883,
"step": 364
},
{
"epoch": 0.07580477673935618,
"grad_norm": 0.48322999477386475,
"learning_rate": 8e-05,
"loss": 1.6383,
"step": 365
},
{
"epoch": 0.07601246105919003,
"grad_norm": 0.47941911220550537,
"learning_rate": 8e-05,
"loss": 1.6032,
"step": 366
},
{
"epoch": 0.07622014537902388,
"grad_norm": 0.4892639219760895,
"learning_rate": 8e-05,
"loss": 1.6862,
"step": 367
},
{
"epoch": 0.07642782969885774,
"grad_norm": 0.48457542061805725,
"learning_rate": 8e-05,
"loss": 1.6253,
"step": 368
},
{
"epoch": 0.07663551401869159,
"grad_norm": 0.48371532559394836,
"learning_rate": 8e-05,
"loss": 1.7072,
"step": 369
},
{
"epoch": 0.07684319833852545,
"grad_norm": 0.5162315964698792,
"learning_rate": 8e-05,
"loss": 1.6952,
"step": 370
},
{
"epoch": 0.0770508826583593,
"grad_norm": 0.494991660118103,
"learning_rate": 8e-05,
"loss": 1.7292,
"step": 371
},
{
"epoch": 0.07725856697819315,
"grad_norm": 0.510187029838562,
"learning_rate": 8e-05,
"loss": 1.6835,
"step": 372
},
{
"epoch": 0.077466251298027,
"grad_norm": 0.47694623470306396,
"learning_rate": 8e-05,
"loss": 1.6809,
"step": 373
},
{
"epoch": 0.07767393561786085,
"grad_norm": 0.4950973093509674,
"learning_rate": 8e-05,
"loss": 1.6691,
"step": 374
},
{
"epoch": 0.0778816199376947,
"grad_norm": 0.4885307848453522,
"learning_rate": 8e-05,
"loss": 1.6682,
"step": 375
},
{
"epoch": 0.07808930425752855,
"grad_norm": 0.5185424089431763,
"learning_rate": 8e-05,
"loss": 1.698,
"step": 376
},
{
"epoch": 0.0782969885773624,
"grad_norm": 0.48990127444267273,
"learning_rate": 8e-05,
"loss": 1.6877,
"step": 377
},
{
"epoch": 0.07850467289719626,
"grad_norm": 0.5352708101272583,
"learning_rate": 8e-05,
"loss": 1.6141,
"step": 378
},
{
"epoch": 0.07871235721703011,
"grad_norm": 0.48971933126449585,
"learning_rate": 8e-05,
"loss": 1.6644,
"step": 379
},
{
"epoch": 0.07892004153686397,
"grad_norm": 0.5039674639701843,
"learning_rate": 8e-05,
"loss": 1.6772,
"step": 380
},
{
"epoch": 0.07912772585669782,
"grad_norm": 0.5058568120002747,
"learning_rate": 8e-05,
"loss": 1.6109,
"step": 381
},
{
"epoch": 0.07933541017653167,
"grad_norm": 0.5172670483589172,
"learning_rate": 8e-05,
"loss": 1.6229,
"step": 382
},
{
"epoch": 0.07954309449636553,
"grad_norm": 0.4776594340801239,
"learning_rate": 8e-05,
"loss": 1.6517,
"step": 383
},
{
"epoch": 0.07975077881619938,
"grad_norm": 0.5346738696098328,
"learning_rate": 8e-05,
"loss": 1.6985,
"step": 384
},
{
"epoch": 0.07995846313603323,
"grad_norm": 0.5013498067855835,
"learning_rate": 8e-05,
"loss": 1.6297,
"step": 385
},
{
"epoch": 0.08016614745586709,
"grad_norm": 0.6052170991897583,
"learning_rate": 8e-05,
"loss": 1.6397,
"step": 386
},
{
"epoch": 0.08037383177570094,
"grad_norm": 0.4749467074871063,
"learning_rate": 8e-05,
"loss": 1.6263,
"step": 387
},
{
"epoch": 0.08058151609553478,
"grad_norm": 0.4913139045238495,
"learning_rate": 8e-05,
"loss": 1.6091,
"step": 388
},
{
"epoch": 0.08078920041536863,
"grad_norm": 0.5050566792488098,
"learning_rate": 8e-05,
"loss": 1.6801,
"step": 389
},
{
"epoch": 0.08099688473520249,
"grad_norm": 0.4727949798107147,
"learning_rate": 8e-05,
"loss": 1.6141,
"step": 390
},
{
"epoch": 0.08120456905503634,
"grad_norm": 0.4729914367198944,
"learning_rate": 8e-05,
"loss": 1.5569,
"step": 391
},
{
"epoch": 0.0814122533748702,
"grad_norm": 0.4925561249256134,
"learning_rate": 8e-05,
"loss": 1.7113,
"step": 392
},
{
"epoch": 0.08161993769470405,
"grad_norm": 0.49586695432662964,
"learning_rate": 8e-05,
"loss": 1.6699,
"step": 393
},
{
"epoch": 0.0818276220145379,
"grad_norm": 0.5150527954101562,
"learning_rate": 8e-05,
"loss": 1.6564,
"step": 394
},
{
"epoch": 0.08203530633437175,
"grad_norm": 0.5019596219062805,
"learning_rate": 8e-05,
"loss": 1.6275,
"step": 395
},
{
"epoch": 0.08224299065420561,
"grad_norm": 0.4876788258552551,
"learning_rate": 8e-05,
"loss": 1.5836,
"step": 396
},
{
"epoch": 0.08245067497403946,
"grad_norm": 0.49811601638793945,
"learning_rate": 8e-05,
"loss": 1.6608,
"step": 397
},
{
"epoch": 0.08265835929387332,
"grad_norm": 0.5187938809394836,
"learning_rate": 8e-05,
"loss": 1.6999,
"step": 398
},
{
"epoch": 0.08286604361370717,
"grad_norm": 0.4808737635612488,
"learning_rate": 8e-05,
"loss": 1.6004,
"step": 399
},
{
"epoch": 0.08307372793354102,
"grad_norm": 0.515282392501831,
"learning_rate": 8e-05,
"loss": 1.7455,
"step": 400
},
{
"epoch": 0.08328141225337488,
"grad_norm": 0.5063252449035645,
"learning_rate": 8e-05,
"loss": 1.6955,
"step": 401
},
{
"epoch": 0.08348909657320873,
"grad_norm": 0.5026871562004089,
"learning_rate": 8e-05,
"loss": 1.7067,
"step": 402
},
{
"epoch": 0.08369678089304257,
"grad_norm": 0.4903314709663391,
"learning_rate": 8e-05,
"loss": 1.6333,
"step": 403
},
{
"epoch": 0.08390446521287642,
"grad_norm": 0.4954453110694885,
"learning_rate": 8e-05,
"loss": 1.5555,
"step": 404
},
{
"epoch": 0.08411214953271028,
"grad_norm": 0.5026134252548218,
"learning_rate": 8e-05,
"loss": 1.6969,
"step": 405
},
{
"epoch": 0.08431983385254413,
"grad_norm": 0.49587976932525635,
"learning_rate": 8e-05,
"loss": 1.651,
"step": 406
},
{
"epoch": 0.08452751817237798,
"grad_norm": 0.5106424689292908,
"learning_rate": 8e-05,
"loss": 1.7021,
"step": 407
},
{
"epoch": 0.08473520249221184,
"grad_norm": 0.5195350646972656,
"learning_rate": 8e-05,
"loss": 1.7539,
"step": 408
},
{
"epoch": 0.08494288681204569,
"grad_norm": 0.501416027545929,
"learning_rate": 8e-05,
"loss": 1.725,
"step": 409
},
{
"epoch": 0.08515057113187954,
"grad_norm": 0.48874419927597046,
"learning_rate": 8e-05,
"loss": 1.7037,
"step": 410
},
{
"epoch": 0.0853582554517134,
"grad_norm": 0.4698718190193176,
"learning_rate": 8e-05,
"loss": 1.6509,
"step": 411
},
{
"epoch": 0.08556593977154725,
"grad_norm": 0.4783203899860382,
"learning_rate": 8e-05,
"loss": 1.6171,
"step": 412
},
{
"epoch": 0.0857736240913811,
"grad_norm": 0.481812983751297,
"learning_rate": 8e-05,
"loss": 1.6676,
"step": 413
},
{
"epoch": 0.08598130841121496,
"grad_norm": 0.481001615524292,
"learning_rate": 8e-05,
"loss": 1.7201,
"step": 414
},
{
"epoch": 0.08618899273104881,
"grad_norm": 0.48555779457092285,
"learning_rate": 8e-05,
"loss": 1.6573,
"step": 415
},
{
"epoch": 0.08639667705088266,
"grad_norm": 0.49405309557914734,
"learning_rate": 8e-05,
"loss": 1.6688,
"step": 416
},
{
"epoch": 0.08660436137071652,
"grad_norm": 0.5051506757736206,
"learning_rate": 8e-05,
"loss": 1.7261,
"step": 417
},
{
"epoch": 0.08681204569055036,
"grad_norm": 0.48117661476135254,
"learning_rate": 8e-05,
"loss": 1.6696,
"step": 418
},
{
"epoch": 0.08701973001038421,
"grad_norm": 0.4655582904815674,
"learning_rate": 8e-05,
"loss": 1.5525,
"step": 419
},
{
"epoch": 0.08722741433021806,
"grad_norm": 0.4815155863761902,
"learning_rate": 8e-05,
"loss": 1.6482,
"step": 420
},
{
"epoch": 0.08743509865005192,
"grad_norm": 0.4939725697040558,
"learning_rate": 8e-05,
"loss": 1.6752,
"step": 421
},
{
"epoch": 0.08764278296988577,
"grad_norm": 0.4941619634628296,
"learning_rate": 8e-05,
"loss": 1.658,
"step": 422
},
{
"epoch": 0.08785046728971962,
"grad_norm": 0.5332744121551514,
"learning_rate": 8e-05,
"loss": 1.6118,
"step": 423
},
{
"epoch": 0.08805815160955348,
"grad_norm": 0.5089534521102905,
"learning_rate": 8e-05,
"loss": 1.6835,
"step": 424
},
{
"epoch": 0.08826583592938733,
"grad_norm": 0.5659358501434326,
"learning_rate": 8e-05,
"loss": 1.646,
"step": 425
},
{
"epoch": 0.08847352024922119,
"grad_norm": 0.49219921231269836,
"learning_rate": 8e-05,
"loss": 1.6452,
"step": 426
},
{
"epoch": 0.08868120456905504,
"grad_norm": 0.5574254989624023,
"learning_rate": 8e-05,
"loss": 1.6968,
"step": 427
},
{
"epoch": 0.08888888888888889,
"grad_norm": 0.4780307412147522,
"learning_rate": 8e-05,
"loss": 1.6332,
"step": 428
},
{
"epoch": 0.08909657320872275,
"grad_norm": 0.4689270555973053,
"learning_rate": 8e-05,
"loss": 1.6342,
"step": 429
},
{
"epoch": 0.0893042575285566,
"grad_norm": 0.5344724655151367,
"learning_rate": 8e-05,
"loss": 1.6307,
"step": 430
},
{
"epoch": 0.08951194184839045,
"grad_norm": 0.4806777536869049,
"learning_rate": 8e-05,
"loss": 1.6404,
"step": 431
},
{
"epoch": 0.08971962616822429,
"grad_norm": 0.5253303050994873,
"learning_rate": 8e-05,
"loss": 1.6228,
"step": 432
},
{
"epoch": 0.08992731048805815,
"grad_norm": 0.49155253171920776,
"learning_rate": 8e-05,
"loss": 1.6423,
"step": 433
},
{
"epoch": 0.090134994807892,
"grad_norm": 0.5021010637283325,
"learning_rate": 8e-05,
"loss": 1.6646,
"step": 434
},
{
"epoch": 0.09034267912772585,
"grad_norm": 0.5126837491989136,
"learning_rate": 8e-05,
"loss": 1.6719,
"step": 435
},
{
"epoch": 0.0905503634475597,
"grad_norm": 0.48987722396850586,
"learning_rate": 8e-05,
"loss": 1.6355,
"step": 436
},
{
"epoch": 0.09075804776739356,
"grad_norm": 0.46434321999549866,
"learning_rate": 8e-05,
"loss": 1.6236,
"step": 437
},
{
"epoch": 0.09096573208722741,
"grad_norm": 0.4966569244861603,
"learning_rate": 8e-05,
"loss": 1.7529,
"step": 438
},
{
"epoch": 0.09117341640706127,
"grad_norm": 0.5068960189819336,
"learning_rate": 8e-05,
"loss": 1.718,
"step": 439
},
{
"epoch": 0.09138110072689512,
"grad_norm": 0.46872174739837646,
"learning_rate": 8e-05,
"loss": 1.5913,
"step": 440
},
{
"epoch": 0.09158878504672897,
"grad_norm": 0.5093649625778198,
"learning_rate": 8e-05,
"loss": 1.7232,
"step": 441
},
{
"epoch": 0.09179646936656283,
"grad_norm": 0.48676973581314087,
"learning_rate": 8e-05,
"loss": 1.6004,
"step": 442
},
{
"epoch": 0.09200415368639668,
"grad_norm": 0.5160446763038635,
"learning_rate": 8e-05,
"loss": 1.7038,
"step": 443
},
{
"epoch": 0.09221183800623053,
"grad_norm": 0.4958758056163788,
"learning_rate": 8e-05,
"loss": 1.6985,
"step": 444
},
{
"epoch": 0.09241952232606439,
"grad_norm": 0.5180954337120056,
"learning_rate": 8e-05,
"loss": 1.636,
"step": 445
},
{
"epoch": 0.09262720664589824,
"grad_norm": 0.5121625661849976,
"learning_rate": 8e-05,
"loss": 1.6144,
"step": 446
},
{
"epoch": 0.09283489096573208,
"grad_norm": 0.49673452973365784,
"learning_rate": 8e-05,
"loss": 1.5977,
"step": 447
},
{
"epoch": 0.09304257528556593,
"grad_norm": 0.4997375011444092,
"learning_rate": 8e-05,
"loss": 1.686,
"step": 448
},
{
"epoch": 0.09325025960539979,
"grad_norm": 0.49643149971961975,
"learning_rate": 8e-05,
"loss": 1.6721,
"step": 449
},
{
"epoch": 0.09345794392523364,
"grad_norm": 0.4948190152645111,
"learning_rate": 8e-05,
"loss": 1.657,
"step": 450
},
{
"epoch": 0.0936656282450675,
"grad_norm": 0.4922389090061188,
"learning_rate": 8e-05,
"loss": 1.65,
"step": 451
},
{
"epoch": 0.09387331256490135,
"grad_norm": 0.5069667100906372,
"learning_rate": 8e-05,
"loss": 1.7523,
"step": 452
},
{
"epoch": 0.0940809968847352,
"grad_norm": 0.4961357116699219,
"learning_rate": 8e-05,
"loss": 1.752,
"step": 453
},
{
"epoch": 0.09428868120456906,
"grad_norm": 0.5226089358329773,
"learning_rate": 8e-05,
"loss": 1.6604,
"step": 454
},
{
"epoch": 0.09449636552440291,
"grad_norm": 0.47510433197021484,
"learning_rate": 8e-05,
"loss": 1.7006,
"step": 455
},
{
"epoch": 0.09470404984423676,
"grad_norm": 0.5109766125679016,
"learning_rate": 8e-05,
"loss": 1.7016,
"step": 456
},
{
"epoch": 0.09491173416407062,
"grad_norm": 0.4836071729660034,
"learning_rate": 8e-05,
"loss": 1.6091,
"step": 457
},
{
"epoch": 0.09511941848390447,
"grad_norm": 0.523279070854187,
"learning_rate": 8e-05,
"loss": 1.6313,
"step": 458
},
{
"epoch": 0.09532710280373832,
"grad_norm": 0.5089949369430542,
"learning_rate": 8e-05,
"loss": 1.6241,
"step": 459
},
{
"epoch": 0.09553478712357218,
"grad_norm": 0.509610652923584,
"learning_rate": 8e-05,
"loss": 1.6721,
"step": 460
},
{
"epoch": 0.09574247144340603,
"grad_norm": 0.4768628776073456,
"learning_rate": 8e-05,
"loss": 1.5789,
"step": 461
},
{
"epoch": 0.09595015576323987,
"grad_norm": 0.5115273594856262,
"learning_rate": 8e-05,
"loss": 1.7081,
"step": 462
},
{
"epoch": 0.09615784008307372,
"grad_norm": 0.4946316182613373,
"learning_rate": 8e-05,
"loss": 1.6407,
"step": 463
},
{
"epoch": 0.09636552440290758,
"grad_norm": 0.5241323113441467,
"learning_rate": 8e-05,
"loss": 1.7573,
"step": 464
},
{
"epoch": 0.09657320872274143,
"grad_norm": 0.47573548555374146,
"learning_rate": 8e-05,
"loss": 1.5695,
"step": 465
},
{
"epoch": 0.09678089304257528,
"grad_norm": 0.49498170614242554,
"learning_rate": 8e-05,
"loss": 1.6743,
"step": 466
},
{
"epoch": 0.09698857736240914,
"grad_norm": 0.48741909861564636,
"learning_rate": 8e-05,
"loss": 1.645,
"step": 467
},
{
"epoch": 0.09719626168224299,
"grad_norm": 0.498971551656723,
"learning_rate": 8e-05,
"loss": 1.6154,
"step": 468
},
{
"epoch": 0.09740394600207684,
"grad_norm": 0.5093749165534973,
"learning_rate": 8e-05,
"loss": 1.7089,
"step": 469
},
{
"epoch": 0.0976116303219107,
"grad_norm": 0.5314822793006897,
"learning_rate": 8e-05,
"loss": 1.7294,
"step": 470
},
{
"epoch": 0.09781931464174455,
"grad_norm": 0.5050390958786011,
"learning_rate": 8e-05,
"loss": 1.6589,
"step": 471
},
{
"epoch": 0.0980269989615784,
"grad_norm": 0.4838104546070099,
"learning_rate": 8e-05,
"loss": 1.6221,
"step": 472
},
{
"epoch": 0.09823468328141226,
"grad_norm": 0.5459195375442505,
"learning_rate": 8e-05,
"loss": 1.6315,
"step": 473
},
{
"epoch": 0.09844236760124611,
"grad_norm": 0.511231541633606,
"learning_rate": 8e-05,
"loss": 1.7314,
"step": 474
},
{
"epoch": 0.09865005192107996,
"grad_norm": 0.5203128457069397,
"learning_rate": 8e-05,
"loss": 1.6929,
"step": 475
},
{
"epoch": 0.0988577362409138,
"grad_norm": 0.509968101978302,
"learning_rate": 8e-05,
"loss": 1.5961,
"step": 476
},
{
"epoch": 0.09906542056074766,
"grad_norm": 0.45741814374923706,
"learning_rate": 8e-05,
"loss": 1.5686,
"step": 477
},
{
"epoch": 0.09927310488058151,
"grad_norm": 0.49833589792251587,
"learning_rate": 8e-05,
"loss": 1.6054,
"step": 478
},
{
"epoch": 0.09948078920041536,
"grad_norm": 0.4866011142730713,
"learning_rate": 8e-05,
"loss": 1.5743,
"step": 479
},
{
"epoch": 0.09968847352024922,
"grad_norm": 0.48203375935554504,
"learning_rate": 8e-05,
"loss": 1.6288,
"step": 480
},
{
"epoch": 0.09989615784008307,
"grad_norm": 0.48031631112098694,
"learning_rate": 8e-05,
"loss": 1.6257,
"step": 481
},
{
"epoch": 0.10010384215991693,
"grad_norm": 0.4775748550891876,
"learning_rate": 8e-05,
"loss": 1.582,
"step": 482
},
{
"epoch": 0.10031152647975078,
"grad_norm": 0.5151557326316833,
"learning_rate": 8e-05,
"loss": 1.6394,
"step": 483
},
{
"epoch": 0.10051921079958463,
"grad_norm": 0.47847479581832886,
"learning_rate": 8e-05,
"loss": 1.6036,
"step": 484
},
{
"epoch": 0.10072689511941849,
"grad_norm": 0.5062543153762817,
"learning_rate": 8e-05,
"loss": 1.6405,
"step": 485
},
{
"epoch": 0.10093457943925234,
"grad_norm": 0.46788135170936584,
"learning_rate": 8e-05,
"loss": 1.6106,
"step": 486
},
{
"epoch": 0.10114226375908619,
"grad_norm": 0.47770899534225464,
"learning_rate": 8e-05,
"loss": 1.6511,
"step": 487
},
{
"epoch": 0.10134994807892005,
"grad_norm": 0.5121338963508606,
"learning_rate": 8e-05,
"loss": 1.6954,
"step": 488
},
{
"epoch": 0.1015576323987539,
"grad_norm": 0.5268594026565552,
"learning_rate": 8e-05,
"loss": 1.7471,
"step": 489
},
{
"epoch": 0.10176531671858775,
"grad_norm": 0.4916820526123047,
"learning_rate": 8e-05,
"loss": 1.6428,
"step": 490
},
{
"epoch": 0.10197300103842159,
"grad_norm": 0.4892474412918091,
"learning_rate": 8e-05,
"loss": 1.6466,
"step": 491
},
{
"epoch": 0.10218068535825545,
"grad_norm": 0.48951107263565063,
"learning_rate": 8e-05,
"loss": 1.7112,
"step": 492
},
{
"epoch": 0.1023883696780893,
"grad_norm": 0.4870230555534363,
"learning_rate": 8e-05,
"loss": 1.7277,
"step": 493
},
{
"epoch": 0.10259605399792315,
"grad_norm": 0.5075536370277405,
"learning_rate": 8e-05,
"loss": 1.6364,
"step": 494
},
{
"epoch": 0.102803738317757,
"grad_norm": 0.5025159120559692,
"learning_rate": 8e-05,
"loss": 1.6866,
"step": 495
},
{
"epoch": 0.10301142263759086,
"grad_norm": 0.49492985010147095,
"learning_rate": 8e-05,
"loss": 1.6144,
"step": 496
},
{
"epoch": 0.10321910695742471,
"grad_norm": 0.48841235041618347,
"learning_rate": 8e-05,
"loss": 1.6522,
"step": 497
},
{
"epoch": 0.10342679127725857,
"grad_norm": 0.47402575612068176,
"learning_rate": 8e-05,
"loss": 1.5525,
"step": 498
},
{
"epoch": 0.10363447559709242,
"grad_norm": 0.4916023910045624,
"learning_rate": 8e-05,
"loss": 1.6744,
"step": 499
},
{
"epoch": 0.10384215991692627,
"grad_norm": 0.4725121259689331,
"learning_rate": 8e-05,
"loss": 1.5623,
"step": 500
},
{
"epoch": 0.10404984423676013,
"grad_norm": 0.4987272322177887,
"learning_rate": 8e-05,
"loss": 1.6302,
"step": 501
},
{
"epoch": 0.10425752855659398,
"grad_norm": 0.48811638355255127,
"learning_rate": 8e-05,
"loss": 1.6124,
"step": 502
},
{
"epoch": 0.10446521287642783,
"grad_norm": 0.5044394731521606,
"learning_rate": 8e-05,
"loss": 1.7075,
"step": 503
},
{
"epoch": 0.10467289719626169,
"grad_norm": 0.4859680235385895,
"learning_rate": 8e-05,
"loss": 1.6975,
"step": 504
},
{
"epoch": 0.10488058151609553,
"grad_norm": 0.528005063533783,
"learning_rate": 8e-05,
"loss": 1.6129,
"step": 505
},
{
"epoch": 0.10508826583592938,
"grad_norm": 0.5075289011001587,
"learning_rate": 8e-05,
"loss": 1.7663,
"step": 506
},
{
"epoch": 0.10529595015576323,
"grad_norm": 0.5202328562736511,
"learning_rate": 8e-05,
"loss": 1.685,
"step": 507
},
{
"epoch": 0.10550363447559709,
"grad_norm": 0.47399210929870605,
"learning_rate": 8e-05,
"loss": 1.612,
"step": 508
},
{
"epoch": 0.10571131879543094,
"grad_norm": 0.48944804072380066,
"learning_rate": 8e-05,
"loss": 1.5956,
"step": 509
},
{
"epoch": 0.1059190031152648,
"grad_norm": 0.5143656730651855,
"learning_rate": 8e-05,
"loss": 1.6964,
"step": 510
},
{
"epoch": 0.10612668743509865,
"grad_norm": 0.4810107350349426,
"learning_rate": 8e-05,
"loss": 1.6548,
"step": 511
},
{
"epoch": 0.1063343717549325,
"grad_norm": 0.48855581879615784,
"learning_rate": 8e-05,
"loss": 1.637,
"step": 512
},
{
"epoch": 0.10654205607476636,
"grad_norm": 0.477912038564682,
"learning_rate": 8e-05,
"loss": 1.6137,
"step": 513
},
{
"epoch": 0.10674974039460021,
"grad_norm": 0.49473705887794495,
"learning_rate": 8e-05,
"loss": 1.6657,
"step": 514
},
{
"epoch": 0.10695742471443406,
"grad_norm": 0.4874957501888275,
"learning_rate": 8e-05,
"loss": 1.5984,
"step": 515
},
{
"epoch": 0.10716510903426792,
"grad_norm": 0.47925207018852234,
"learning_rate": 8e-05,
"loss": 1.6142,
"step": 516
},
{
"epoch": 0.10737279335410177,
"grad_norm": 0.49199312925338745,
"learning_rate": 8e-05,
"loss": 1.6706,
"step": 517
},
{
"epoch": 0.10758047767393562,
"grad_norm": 0.4763036072254181,
"learning_rate": 8e-05,
"loss": 1.6835,
"step": 518
},
{
"epoch": 0.10778816199376948,
"grad_norm": 0.5105186700820923,
"learning_rate": 8e-05,
"loss": 1.6513,
"step": 519
},
{
"epoch": 0.10799584631360332,
"grad_norm": 0.48001575469970703,
"learning_rate": 8e-05,
"loss": 1.6549,
"step": 520
},
{
"epoch": 0.10820353063343717,
"grad_norm": 0.5005422830581665,
"learning_rate": 8e-05,
"loss": 1.6499,
"step": 521
},
{
"epoch": 0.10841121495327102,
"grad_norm": 0.4944564402103424,
"learning_rate": 8e-05,
"loss": 1.6491,
"step": 522
},
{
"epoch": 0.10861889927310488,
"grad_norm": 0.47744297981262207,
"learning_rate": 8e-05,
"loss": 1.5596,
"step": 523
},
{
"epoch": 0.10882658359293873,
"grad_norm": 0.4701041877269745,
"learning_rate": 8e-05,
"loss": 1.6377,
"step": 524
},
{
"epoch": 0.10903426791277258,
"grad_norm": 0.5091982483863831,
"learning_rate": 8e-05,
"loss": 1.7002,
"step": 525
},
{
"epoch": 0.10924195223260644,
"grad_norm": 0.4894920587539673,
"learning_rate": 8e-05,
"loss": 1.6587,
"step": 526
},
{
"epoch": 0.10944963655244029,
"grad_norm": 0.48994290828704834,
"learning_rate": 8e-05,
"loss": 1.6763,
"step": 527
},
{
"epoch": 0.10965732087227414,
"grad_norm": 0.49817031621932983,
"learning_rate": 8e-05,
"loss": 1.6896,
"step": 528
},
{
"epoch": 0.109865005192108,
"grad_norm": 0.4927424490451813,
"learning_rate": 8e-05,
"loss": 1.634,
"step": 529
},
{
"epoch": 0.11007268951194185,
"grad_norm": 0.46971508860588074,
"learning_rate": 8e-05,
"loss": 1.6402,
"step": 530
},
{
"epoch": 0.1102803738317757,
"grad_norm": 0.5013573169708252,
"learning_rate": 8e-05,
"loss": 1.6279,
"step": 531
},
{
"epoch": 0.11048805815160956,
"grad_norm": 0.494057297706604,
"learning_rate": 8e-05,
"loss": 1.6284,
"step": 532
},
{
"epoch": 0.11069574247144341,
"grad_norm": 0.49397385120391846,
"learning_rate": 8e-05,
"loss": 1.6885,
"step": 533
},
{
"epoch": 0.11090342679127727,
"grad_norm": 0.5148130059242249,
"learning_rate": 8e-05,
"loss": 1.6592,
"step": 534
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.4772680401802063,
"learning_rate": 8e-05,
"loss": 1.6081,
"step": 535
},
{
"epoch": 0.11131879543094496,
"grad_norm": 0.5171798467636108,
"learning_rate": 8e-05,
"loss": 1.7278,
"step": 536
},
{
"epoch": 0.11152647975077881,
"grad_norm": 0.47712597250938416,
"learning_rate": 8e-05,
"loss": 1.6092,
"step": 537
},
{
"epoch": 0.11173416407061267,
"grad_norm": 0.47499656677246094,
"learning_rate": 8e-05,
"loss": 1.598,
"step": 538
},
{
"epoch": 0.11194184839044652,
"grad_norm": 0.4703918397426605,
"learning_rate": 8e-05,
"loss": 1.6449,
"step": 539
},
{
"epoch": 0.11214953271028037,
"grad_norm": 0.5092900991439819,
"learning_rate": 8e-05,
"loss": 1.6808,
"step": 540
},
{
"epoch": 0.11235721703011423,
"grad_norm": 0.48537686467170715,
"learning_rate": 8e-05,
"loss": 1.6026,
"step": 541
},
{
"epoch": 0.11256490134994808,
"grad_norm": 0.49423104524612427,
"learning_rate": 8e-05,
"loss": 1.7332,
"step": 542
},
{
"epoch": 0.11277258566978193,
"grad_norm": 0.48707374930381775,
"learning_rate": 8e-05,
"loss": 1.6686,
"step": 543
},
{
"epoch": 0.11298026998961579,
"grad_norm": 0.4906916618347168,
"learning_rate": 8e-05,
"loss": 1.6518,
"step": 544
},
{
"epoch": 0.11318795430944964,
"grad_norm": 0.4747893512248993,
"learning_rate": 8e-05,
"loss": 1.6452,
"step": 545
},
{
"epoch": 0.1133956386292835,
"grad_norm": 0.4864533245563507,
"learning_rate": 8e-05,
"loss": 1.6477,
"step": 546
},
{
"epoch": 0.11360332294911735,
"grad_norm": 0.469113826751709,
"learning_rate": 8e-05,
"loss": 1.5731,
"step": 547
},
{
"epoch": 0.1138110072689512,
"grad_norm": 0.48744997382164,
"learning_rate": 8e-05,
"loss": 1.6906,
"step": 548
},
{
"epoch": 0.11401869158878504,
"grad_norm": 0.4924449920654297,
"learning_rate": 8e-05,
"loss": 1.5568,
"step": 549
},
{
"epoch": 0.1142263759086189,
"grad_norm": 0.46387964487075806,
"learning_rate": 8e-05,
"loss": 1.6574,
"step": 550
},
{
"epoch": 0.11443406022845275,
"grad_norm": 0.49000611901283264,
"learning_rate": 8e-05,
"loss": 1.6969,
"step": 551
},
{
"epoch": 0.1146417445482866,
"grad_norm": 0.5043030977249146,
"learning_rate": 8e-05,
"loss": 1.6773,
"step": 552
},
{
"epoch": 0.11484942886812045,
"grad_norm": 0.5181741118431091,
"learning_rate": 8e-05,
"loss": 1.6705,
"step": 553
},
{
"epoch": 0.11505711318795431,
"grad_norm": 0.517162561416626,
"learning_rate": 8e-05,
"loss": 1.7016,
"step": 554
},
{
"epoch": 0.11526479750778816,
"grad_norm": 0.5007038116455078,
"learning_rate": 8e-05,
"loss": 1.6708,
"step": 555
},
{
"epoch": 0.11547248182762201,
"grad_norm": 0.49101993441581726,
"learning_rate": 8e-05,
"loss": 1.6829,
"step": 556
},
{
"epoch": 0.11568016614745587,
"grad_norm": 0.47777554392814636,
"learning_rate": 8e-05,
"loss": 1.6972,
"step": 557
},
{
"epoch": 0.11588785046728972,
"grad_norm": 0.5146732330322266,
"learning_rate": 8e-05,
"loss": 1.6615,
"step": 558
},
{
"epoch": 0.11609553478712357,
"grad_norm": 0.4889799654483795,
"learning_rate": 8e-05,
"loss": 1.6389,
"step": 559
},
{
"epoch": 0.11630321910695743,
"grad_norm": 0.48216649889945984,
"learning_rate": 8e-05,
"loss": 1.7034,
"step": 560
},
{
"epoch": 0.11651090342679128,
"grad_norm": 0.466871052980423,
"learning_rate": 8e-05,
"loss": 1.5976,
"step": 561
},
{
"epoch": 0.11671858774662514,
"grad_norm": 0.4961405396461487,
"learning_rate": 8e-05,
"loss": 1.6173,
"step": 562
},
{
"epoch": 0.11692627206645899,
"grad_norm": 0.5158309936523438,
"learning_rate": 8e-05,
"loss": 1.7243,
"step": 563
},
{
"epoch": 0.11713395638629283,
"grad_norm": 0.5011987686157227,
"learning_rate": 8e-05,
"loss": 1.6861,
"step": 564
},
{
"epoch": 0.11734164070612668,
"grad_norm": 0.535325825214386,
"learning_rate": 8e-05,
"loss": 1.799,
"step": 565
},
{
"epoch": 0.11754932502596054,
"grad_norm": 0.4968489110469818,
"learning_rate": 8e-05,
"loss": 1.7096,
"step": 566
},
{
"epoch": 0.11775700934579439,
"grad_norm": 0.4858616292476654,
"learning_rate": 8e-05,
"loss": 1.6617,
"step": 567
},
{
"epoch": 0.11796469366562824,
"grad_norm": 0.49563148617744446,
"learning_rate": 8e-05,
"loss": 1.6681,
"step": 568
},
{
"epoch": 0.1181723779854621,
"grad_norm": 0.49028831720352173,
"learning_rate": 8e-05,
"loss": 1.6425,
"step": 569
},
{
"epoch": 0.11838006230529595,
"grad_norm": 0.4926828145980835,
"learning_rate": 8e-05,
"loss": 1.6134,
"step": 570
},
{
"epoch": 0.1185877466251298,
"grad_norm": 0.4846639037132263,
"learning_rate": 8e-05,
"loss": 1.6194,
"step": 571
},
{
"epoch": 0.11879543094496366,
"grad_norm": 0.4756839871406555,
"learning_rate": 8e-05,
"loss": 1.6454,
"step": 572
},
{
"epoch": 0.11900311526479751,
"grad_norm": 0.4722893536090851,
"learning_rate": 8e-05,
"loss": 1.6123,
"step": 573
},
{
"epoch": 0.11921079958463136,
"grad_norm": 0.4920216202735901,
"learning_rate": 8e-05,
"loss": 1.6916,
"step": 574
},
{
"epoch": 0.11941848390446522,
"grad_norm": 0.48332494497299194,
"learning_rate": 8e-05,
"loss": 1.6414,
"step": 575
},
{
"epoch": 0.11962616822429907,
"grad_norm": 0.5009651780128479,
"learning_rate": 8e-05,
"loss": 1.6119,
"step": 576
},
{
"epoch": 0.11983385254413292,
"grad_norm": 0.4851033389568329,
"learning_rate": 8e-05,
"loss": 1.6018,
"step": 577
},
{
"epoch": 0.12004153686396678,
"grad_norm": 0.4941191077232361,
"learning_rate": 8e-05,
"loss": 1.6982,
"step": 578
},
{
"epoch": 0.12024922118380062,
"grad_norm": 0.4699910879135132,
"learning_rate": 8e-05,
"loss": 1.6004,
"step": 579
},
{
"epoch": 0.12045690550363447,
"grad_norm": 0.47666141390800476,
"learning_rate": 8e-05,
"loss": 1.6523,
"step": 580
},
{
"epoch": 0.12066458982346832,
"grad_norm": 0.5250875949859619,
"learning_rate": 8e-05,
"loss": 1.7131,
"step": 581
},
{
"epoch": 0.12087227414330218,
"grad_norm": 0.47818005084991455,
"learning_rate": 8e-05,
"loss": 1.6228,
"step": 582
},
{
"epoch": 0.12107995846313603,
"grad_norm": 0.46727368235588074,
"learning_rate": 8e-05,
"loss": 1.5709,
"step": 583
},
{
"epoch": 0.12128764278296988,
"grad_norm": 0.4783129394054413,
"learning_rate": 8e-05,
"loss": 1.6809,
"step": 584
},
{
"epoch": 0.12149532710280374,
"grad_norm": 0.48866942524909973,
"learning_rate": 8e-05,
"loss": 1.6795,
"step": 585
},
{
"epoch": 0.12170301142263759,
"grad_norm": 0.4863775968551636,
"learning_rate": 8e-05,
"loss": 1.6499,
"step": 586
},
{
"epoch": 0.12191069574247144,
"grad_norm": 0.48488086462020874,
"learning_rate": 8e-05,
"loss": 1.6765,
"step": 587
},
{
"epoch": 0.1221183800623053,
"grad_norm": 0.5019093751907349,
"learning_rate": 8e-05,
"loss": 1.6547,
"step": 588
},
{
"epoch": 0.12232606438213915,
"grad_norm": 0.484720379114151,
"learning_rate": 8e-05,
"loss": 1.6438,
"step": 589
},
{
"epoch": 0.122533748701973,
"grad_norm": 0.48393747210502625,
"learning_rate": 8e-05,
"loss": 1.6595,
"step": 590
},
{
"epoch": 0.12274143302180686,
"grad_norm": 0.5197237133979797,
"learning_rate": 8e-05,
"loss": 1.6346,
"step": 591
},
{
"epoch": 0.12294911734164071,
"grad_norm": 0.4648591876029968,
"learning_rate": 8e-05,
"loss": 1.6076,
"step": 592
},
{
"epoch": 0.12315680166147455,
"grad_norm": 0.4920279085636139,
"learning_rate": 8e-05,
"loss": 1.6787,
"step": 593
},
{
"epoch": 0.1233644859813084,
"grad_norm": 0.48412156105041504,
"learning_rate": 8e-05,
"loss": 1.6896,
"step": 594
},
{
"epoch": 0.12357217030114226,
"grad_norm": 0.4906349778175354,
"learning_rate": 8e-05,
"loss": 1.6746,
"step": 595
},
{
"epoch": 0.12377985462097611,
"grad_norm": 0.49094000458717346,
"learning_rate": 8e-05,
"loss": 1.6618,
"step": 596
},
{
"epoch": 0.12398753894080997,
"grad_norm": 0.48553135991096497,
"learning_rate": 8e-05,
"loss": 1.7193,
"step": 597
},
{
"epoch": 0.12419522326064382,
"grad_norm": 0.48377707600593567,
"learning_rate": 8e-05,
"loss": 1.6537,
"step": 598
},
{
"epoch": 0.12440290758047767,
"grad_norm": 0.47426944971084595,
"learning_rate": 8e-05,
"loss": 1.6248,
"step": 599
},
{
"epoch": 0.12461059190031153,
"grad_norm": 0.4826415777206421,
"learning_rate": 8e-05,
"loss": 1.6596,
"step": 600
},
{
"epoch": 0.12481827622014538,
"grad_norm": 0.49226781725883484,
"learning_rate": 8e-05,
"loss": 1.7157,
"step": 601
},
{
"epoch": 0.12502596053997922,
"grad_norm": 0.46622025966644287,
"learning_rate": 8e-05,
"loss": 1.61,
"step": 602
},
{
"epoch": 0.1252336448598131,
"grad_norm": 0.4895409345626831,
"learning_rate": 8e-05,
"loss": 1.6629,
"step": 603
},
{
"epoch": 0.12544132917964693,
"grad_norm": 0.4662224352359772,
"learning_rate": 8e-05,
"loss": 1.6188,
"step": 604
},
{
"epoch": 0.1256490134994808,
"grad_norm": 0.4780591130256653,
"learning_rate": 8e-05,
"loss": 1.5856,
"step": 605
},
{
"epoch": 0.12585669781931463,
"grad_norm": 0.49765658378601074,
"learning_rate": 8e-05,
"loss": 1.613,
"step": 606
},
{
"epoch": 0.1260643821391485,
"grad_norm": 0.4997657239437103,
"learning_rate": 8e-05,
"loss": 1.6758,
"step": 607
},
{
"epoch": 0.12627206645898234,
"grad_norm": 0.48155564069747925,
"learning_rate": 8e-05,
"loss": 1.6179,
"step": 608
},
{
"epoch": 0.1264797507788162,
"grad_norm": 0.4809686541557312,
"learning_rate": 8e-05,
"loss": 1.5715,
"step": 609
},
{
"epoch": 0.12668743509865005,
"grad_norm": 0.5093618035316467,
"learning_rate": 8e-05,
"loss": 1.7242,
"step": 610
},
{
"epoch": 0.12689511941848391,
"grad_norm": 0.4809294641017914,
"learning_rate": 8e-05,
"loss": 1.6881,
"step": 611
},
{
"epoch": 0.12710280373831775,
"grad_norm": 0.4776981472969055,
"learning_rate": 8e-05,
"loss": 1.6183,
"step": 612
},
{
"epoch": 0.12731048805815162,
"grad_norm": 0.4951392412185669,
"learning_rate": 8e-05,
"loss": 1.7086,
"step": 613
},
{
"epoch": 0.12751817237798546,
"grad_norm": 0.48191651701927185,
"learning_rate": 8e-05,
"loss": 1.6385,
"step": 614
},
{
"epoch": 0.1277258566978193,
"grad_norm": 0.4886009693145752,
"learning_rate": 8e-05,
"loss": 1.618,
"step": 615
},
{
"epoch": 0.12793354101765317,
"grad_norm": 0.49545979499816895,
"learning_rate": 8e-05,
"loss": 1.6361,
"step": 616
},
{
"epoch": 0.128141225337487,
"grad_norm": 0.49346309900283813,
"learning_rate": 8e-05,
"loss": 1.693,
"step": 617
},
{
"epoch": 0.12834890965732088,
"grad_norm": 0.5284665822982788,
"learning_rate": 8e-05,
"loss": 1.6347,
"step": 618
},
{
"epoch": 0.12855659397715472,
"grad_norm": 0.5081282258033752,
"learning_rate": 8e-05,
"loss": 1.6581,
"step": 619
},
{
"epoch": 0.12876427829698858,
"grad_norm": 0.5118349194526672,
"learning_rate": 8e-05,
"loss": 1.7038,
"step": 620
},
{
"epoch": 0.12897196261682242,
"grad_norm": 0.49056780338287354,
"learning_rate": 8e-05,
"loss": 1.7102,
"step": 621
},
{
"epoch": 0.1291796469366563,
"grad_norm": 0.4869656562805176,
"learning_rate": 8e-05,
"loss": 1.619,
"step": 622
},
{
"epoch": 0.12938733125649013,
"grad_norm": 0.5157124996185303,
"learning_rate": 8e-05,
"loss": 1.6398,
"step": 623
},
{
"epoch": 0.129595015576324,
"grad_norm": 0.48705726861953735,
"learning_rate": 8e-05,
"loss": 1.6655,
"step": 624
},
{
"epoch": 0.12980269989615784,
"grad_norm": 0.46967780590057373,
"learning_rate": 8e-05,
"loss": 1.6131,
"step": 625
},
{
"epoch": 0.1300103842159917,
"grad_norm": 0.47689181566238403,
"learning_rate": 8e-05,
"loss": 1.6086,
"step": 626
},
{
"epoch": 0.13021806853582554,
"grad_norm": 0.48610159754753113,
"learning_rate": 8e-05,
"loss": 1.6463,
"step": 627
},
{
"epoch": 0.1304257528556594,
"grad_norm": 0.4769737124443054,
"learning_rate": 8e-05,
"loss": 1.57,
"step": 628
},
{
"epoch": 0.13063343717549325,
"grad_norm": 0.47366201877593994,
"learning_rate": 8e-05,
"loss": 1.6771,
"step": 629
},
{
"epoch": 0.1308411214953271,
"grad_norm": 0.5110802054405212,
"learning_rate": 8e-05,
"loss": 1.6377,
"step": 630
},
{
"epoch": 0.13104880581516096,
"grad_norm": 0.5230108499526978,
"learning_rate": 8e-05,
"loss": 1.6979,
"step": 631
},
{
"epoch": 0.1312564901349948,
"grad_norm": 0.4913478493690491,
"learning_rate": 8e-05,
"loss": 1.6687,
"step": 632
},
{
"epoch": 0.13146417445482866,
"grad_norm": 0.5109069347381592,
"learning_rate": 8e-05,
"loss": 1.6655,
"step": 633
},
{
"epoch": 0.1316718587746625,
"grad_norm": 0.5116170644760132,
"learning_rate": 8e-05,
"loss": 1.7231,
"step": 634
},
{
"epoch": 0.13187954309449637,
"grad_norm": 0.5041365623474121,
"learning_rate": 8e-05,
"loss": 1.6218,
"step": 635
},
{
"epoch": 0.1320872274143302,
"grad_norm": 0.4839491546154022,
"learning_rate": 8e-05,
"loss": 1.6518,
"step": 636
},
{
"epoch": 0.13229491173416408,
"grad_norm": 0.4792550802230835,
"learning_rate": 8e-05,
"loss": 1.6506,
"step": 637
},
{
"epoch": 0.13250259605399792,
"grad_norm": 0.48295944929122925,
"learning_rate": 8e-05,
"loss": 1.6901,
"step": 638
},
{
"epoch": 0.13271028037383178,
"grad_norm": 0.4973006844520569,
"learning_rate": 8e-05,
"loss": 1.6432,
"step": 639
},
{
"epoch": 0.13291796469366562,
"grad_norm": 0.4931349456310272,
"learning_rate": 8e-05,
"loss": 1.6903,
"step": 640
},
{
"epoch": 0.1331256490134995,
"grad_norm": 0.5359208583831787,
"learning_rate": 8e-05,
"loss": 1.6107,
"step": 641
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.5108036994934082,
"learning_rate": 8e-05,
"loss": 1.743,
"step": 642
},
{
"epoch": 0.1335410176531672,
"grad_norm": 0.53212970495224,
"learning_rate": 8e-05,
"loss": 1.6529,
"step": 643
},
{
"epoch": 0.13374870197300104,
"grad_norm": 0.4618084132671356,
"learning_rate": 8e-05,
"loss": 1.6473,
"step": 644
},
{
"epoch": 0.13395638629283488,
"grad_norm": 0.4782806932926178,
"learning_rate": 8e-05,
"loss": 1.6267,
"step": 645
},
{
"epoch": 0.13416407061266875,
"grad_norm": 0.5155082941055298,
"learning_rate": 8e-05,
"loss": 1.6114,
"step": 646
},
{
"epoch": 0.13437175493250259,
"grad_norm": 0.5027504563331604,
"learning_rate": 8e-05,
"loss": 1.6559,
"step": 647
},
{
"epoch": 0.13457943925233645,
"grad_norm": 0.5325614213943481,
"learning_rate": 8e-05,
"loss": 1.5957,
"step": 648
},
{
"epoch": 0.1347871235721703,
"grad_norm": 0.4770219922065735,
"learning_rate": 8e-05,
"loss": 1.5635,
"step": 649
},
{
"epoch": 0.13499480789200416,
"grad_norm": 0.4837128818035126,
"learning_rate": 8e-05,
"loss": 1.5886,
"step": 650
},
{
"epoch": 0.135202492211838,
"grad_norm": 0.5071947574615479,
"learning_rate": 8e-05,
"loss": 1.7164,
"step": 651
},
{
"epoch": 0.13541017653167187,
"grad_norm": 0.48948243260383606,
"learning_rate": 8e-05,
"loss": 1.6695,
"step": 652
},
{
"epoch": 0.1356178608515057,
"grad_norm": 0.49877387285232544,
"learning_rate": 8e-05,
"loss": 1.6044,
"step": 653
},
{
"epoch": 0.13582554517133957,
"grad_norm": 0.48378750681877136,
"learning_rate": 8e-05,
"loss": 1.6118,
"step": 654
},
{
"epoch": 0.1360332294911734,
"grad_norm": 0.49697884917259216,
"learning_rate": 8e-05,
"loss": 1.6245,
"step": 655
},
{
"epoch": 0.13624091381100728,
"grad_norm": 0.5250710248947144,
"learning_rate": 8e-05,
"loss": 1.5699,
"step": 656
},
{
"epoch": 0.13644859813084112,
"grad_norm": 0.49164098501205444,
"learning_rate": 8e-05,
"loss": 1.7009,
"step": 657
},
{
"epoch": 0.136656282450675,
"grad_norm": 0.51181560754776,
"learning_rate": 8e-05,
"loss": 1.6749,
"step": 658
},
{
"epoch": 0.13686396677050883,
"grad_norm": 0.4895937442779541,
"learning_rate": 8e-05,
"loss": 1.6471,
"step": 659
},
{
"epoch": 0.13707165109034267,
"grad_norm": 0.500437319278717,
"learning_rate": 8e-05,
"loss": 1.6039,
"step": 660
},
{
"epoch": 0.13727933541017653,
"grad_norm": 0.49318140745162964,
"learning_rate": 8e-05,
"loss": 1.6222,
"step": 661
},
{
"epoch": 0.13748701973001037,
"grad_norm": 0.47766220569610596,
"learning_rate": 8e-05,
"loss": 1.6307,
"step": 662
},
{
"epoch": 0.13769470404984424,
"grad_norm": 0.526156485080719,
"learning_rate": 8e-05,
"loss": 1.6572,
"step": 663
},
{
"epoch": 0.13790238836967808,
"grad_norm": 0.48735764622688293,
"learning_rate": 8e-05,
"loss": 1.6393,
"step": 664
},
{
"epoch": 0.13811007268951195,
"grad_norm": 0.49642348289489746,
"learning_rate": 8e-05,
"loss": 1.6371,
"step": 665
},
{
"epoch": 0.1383177570093458,
"grad_norm": 0.5099259614944458,
"learning_rate": 8e-05,
"loss": 1.6536,
"step": 666
},
{
"epoch": 0.13852544132917965,
"grad_norm": 0.5075072646141052,
"learning_rate": 8e-05,
"loss": 1.6389,
"step": 667
},
{
"epoch": 0.1387331256490135,
"grad_norm": 0.49564462900161743,
"learning_rate": 8e-05,
"loss": 1.6303,
"step": 668
},
{
"epoch": 0.13894080996884736,
"grad_norm": 0.468453049659729,
"learning_rate": 8e-05,
"loss": 1.6019,
"step": 669
},
{
"epoch": 0.1391484942886812,
"grad_norm": 0.4859846234321594,
"learning_rate": 8e-05,
"loss": 1.6411,
"step": 670
},
{
"epoch": 0.13935617860851507,
"grad_norm": 0.468565970659256,
"learning_rate": 8e-05,
"loss": 1.5784,
"step": 671
},
{
"epoch": 0.1395638629283489,
"grad_norm": 0.4832054674625397,
"learning_rate": 8e-05,
"loss": 1.6492,
"step": 672
},
{
"epoch": 0.13977154724818278,
"grad_norm": 0.49340352416038513,
"learning_rate": 8e-05,
"loss": 1.6563,
"step": 673
},
{
"epoch": 0.13997923156801662,
"grad_norm": 0.5209794044494629,
"learning_rate": 8e-05,
"loss": 1.7233,
"step": 674
},
{
"epoch": 0.14018691588785046,
"grad_norm": 0.48782119154930115,
"learning_rate": 8e-05,
"loss": 1.6472,
"step": 675
},
{
"epoch": 0.14039460020768432,
"grad_norm": 0.5135282874107361,
"learning_rate": 8e-05,
"loss": 1.6647,
"step": 676
},
{
"epoch": 0.14060228452751816,
"grad_norm": 0.5062872171401978,
"learning_rate": 8e-05,
"loss": 1.6414,
"step": 677
},
{
"epoch": 0.14080996884735203,
"grad_norm": 0.4905490577220917,
"learning_rate": 8e-05,
"loss": 1.5628,
"step": 678
},
{
"epoch": 0.14101765316718587,
"grad_norm": 0.4919457733631134,
"learning_rate": 8e-05,
"loss": 1.6491,
"step": 679
},
{
"epoch": 0.14122533748701974,
"grad_norm": 0.5233182907104492,
"learning_rate": 8e-05,
"loss": 1.6324,
"step": 680
},
{
"epoch": 0.14143302180685358,
"grad_norm": 0.4853593111038208,
"learning_rate": 8e-05,
"loss": 1.6134,
"step": 681
},
{
"epoch": 0.14164070612668744,
"grad_norm": 0.48515936732292175,
"learning_rate": 8e-05,
"loss": 1.5967,
"step": 682
},
{
"epoch": 0.14184839044652128,
"grad_norm": 0.5354658961296082,
"learning_rate": 8e-05,
"loss": 1.5981,
"step": 683
},
{
"epoch": 0.14205607476635515,
"grad_norm": 0.48015713691711426,
"learning_rate": 8e-05,
"loss": 1.6279,
"step": 684
},
{
"epoch": 0.142263759086189,
"grad_norm": 0.5790321230888367,
"learning_rate": 8e-05,
"loss": 1.7114,
"step": 685
},
{
"epoch": 0.14247144340602286,
"grad_norm": 0.46386808156967163,
"learning_rate": 8e-05,
"loss": 1.6201,
"step": 686
},
{
"epoch": 0.1426791277258567,
"grad_norm": 0.5704835653305054,
"learning_rate": 8e-05,
"loss": 1.6524,
"step": 687
},
{
"epoch": 0.14288681204569054,
"grad_norm": 0.4924614727497101,
"learning_rate": 8e-05,
"loss": 1.7339,
"step": 688
},
{
"epoch": 0.1430944963655244,
"grad_norm": 0.5158697962760925,
"learning_rate": 8e-05,
"loss": 1.6294,
"step": 689
},
{
"epoch": 0.14330218068535824,
"grad_norm": 0.5031910538673401,
"learning_rate": 8e-05,
"loss": 1.5753,
"step": 690
},
{
"epoch": 0.1435098650051921,
"grad_norm": 0.4993326961994171,
"learning_rate": 8e-05,
"loss": 1.7069,
"step": 691
},
{
"epoch": 0.14371754932502595,
"grad_norm": 0.5596215724945068,
"learning_rate": 8e-05,
"loss": 1.6887,
"step": 692
},
{
"epoch": 0.14392523364485982,
"grad_norm": 0.464817613363266,
"learning_rate": 8e-05,
"loss": 1.5974,
"step": 693
},
{
"epoch": 0.14413291796469366,
"grad_norm": 0.5313472151756287,
"learning_rate": 8e-05,
"loss": 1.6584,
"step": 694
},
{
"epoch": 0.14434060228452752,
"grad_norm": 0.470864862203598,
"learning_rate": 8e-05,
"loss": 1.5668,
"step": 695
},
{
"epoch": 0.14454828660436136,
"grad_norm": 0.4789741039276123,
"learning_rate": 8e-05,
"loss": 1.6432,
"step": 696
},
{
"epoch": 0.14475597092419523,
"grad_norm": 0.4912903606891632,
"learning_rate": 8e-05,
"loss": 1.5999,
"step": 697
},
{
"epoch": 0.14496365524402907,
"grad_norm": 0.49190813302993774,
"learning_rate": 8e-05,
"loss": 1.6045,
"step": 698
},
{
"epoch": 0.14517133956386294,
"grad_norm": 0.48753032088279724,
"learning_rate": 8e-05,
"loss": 1.6548,
"step": 699
},
{
"epoch": 0.14537902388369678,
"grad_norm": 0.47778916358947754,
"learning_rate": 8e-05,
"loss": 1.5859,
"step": 700
},
{
"epoch": 0.14558670820353065,
"grad_norm": 0.48212990164756775,
"learning_rate": 8e-05,
"loss": 1.6022,
"step": 701
},
{
"epoch": 0.14579439252336449,
"grad_norm": 0.5078028440475464,
"learning_rate": 8e-05,
"loss": 1.7241,
"step": 702
},
{
"epoch": 0.14600207684319833,
"grad_norm": 0.49921220541000366,
"learning_rate": 8e-05,
"loss": 1.7156,
"step": 703
},
{
"epoch": 0.1462097611630322,
"grad_norm": 0.5023846626281738,
"learning_rate": 8e-05,
"loss": 1.71,
"step": 704
},
{
"epoch": 0.14641744548286603,
"grad_norm": 0.4875635802745819,
"learning_rate": 8e-05,
"loss": 1.6838,
"step": 705
},
{
"epoch": 0.1466251298026999,
"grad_norm": 0.477684885263443,
"learning_rate": 8e-05,
"loss": 1.6201,
"step": 706
},
{
"epoch": 0.14683281412253374,
"grad_norm": 0.4834393858909607,
"learning_rate": 8e-05,
"loss": 1.6228,
"step": 707
},
{
"epoch": 0.1470404984423676,
"grad_norm": 0.4797709286212921,
"learning_rate": 8e-05,
"loss": 1.589,
"step": 708
},
{
"epoch": 0.14724818276220145,
"grad_norm": 0.4814303517341614,
"learning_rate": 8e-05,
"loss": 1.6916,
"step": 709
},
{
"epoch": 0.1474558670820353,
"grad_norm": 0.48855525255203247,
"learning_rate": 8e-05,
"loss": 1.6431,
"step": 710
},
{
"epoch": 0.14766355140186915,
"grad_norm": 0.4643092453479767,
"learning_rate": 8e-05,
"loss": 1.6707,
"step": 711
},
{
"epoch": 0.14787123572170302,
"grad_norm": 0.4828472435474396,
"learning_rate": 8e-05,
"loss": 1.6574,
"step": 712
},
{
"epoch": 0.14807892004153686,
"grad_norm": 0.5036082863807678,
"learning_rate": 8e-05,
"loss": 1.7352,
"step": 713
},
{
"epoch": 0.14828660436137073,
"grad_norm": 0.49272510409355164,
"learning_rate": 8e-05,
"loss": 1.676,
"step": 714
},
{
"epoch": 0.14849428868120457,
"grad_norm": 0.4979461431503296,
"learning_rate": 8e-05,
"loss": 1.6097,
"step": 715
},
{
"epoch": 0.14870197300103843,
"grad_norm": 0.4923213720321655,
"learning_rate": 8e-05,
"loss": 1.5899,
"step": 716
},
{
"epoch": 0.14890965732087227,
"grad_norm": 0.48915570974349976,
"learning_rate": 8e-05,
"loss": 1.5901,
"step": 717
},
{
"epoch": 0.1491173416407061,
"grad_norm": 0.4918684661388397,
"learning_rate": 8e-05,
"loss": 1.6548,
"step": 718
},
{
"epoch": 0.14932502596053998,
"grad_norm": 0.47110480070114136,
"learning_rate": 8e-05,
"loss": 1.6423,
"step": 719
},
{
"epoch": 0.14953271028037382,
"grad_norm": 0.5042193531990051,
"learning_rate": 8e-05,
"loss": 1.6419,
"step": 720
},
{
"epoch": 0.1497403946002077,
"grad_norm": 0.4912029504776001,
"learning_rate": 8e-05,
"loss": 1.6902,
"step": 721
},
{
"epoch": 0.14994807892004153,
"grad_norm": 0.4772588312625885,
"learning_rate": 8e-05,
"loss": 1.572,
"step": 722
},
{
"epoch": 0.1501557632398754,
"grad_norm": 0.4800530970096588,
"learning_rate": 8e-05,
"loss": 1.6327,
"step": 723
},
{
"epoch": 0.15036344755970923,
"grad_norm": 0.5064852237701416,
"learning_rate": 8e-05,
"loss": 1.6544,
"step": 724
},
{
"epoch": 0.1505711318795431,
"grad_norm": 0.4814448654651642,
"learning_rate": 8e-05,
"loss": 1.6141,
"step": 725
},
{
"epoch": 0.15077881619937694,
"grad_norm": 0.497315376996994,
"learning_rate": 8e-05,
"loss": 1.7214,
"step": 726
},
{
"epoch": 0.1509865005192108,
"grad_norm": 0.4816543459892273,
"learning_rate": 8e-05,
"loss": 1.6629,
"step": 727
},
{
"epoch": 0.15119418483904465,
"grad_norm": 0.4759445786476135,
"learning_rate": 8e-05,
"loss": 1.5397,
"step": 728
},
{
"epoch": 0.15140186915887852,
"grad_norm": 0.4926449954509735,
"learning_rate": 8e-05,
"loss": 1.6486,
"step": 729
},
{
"epoch": 0.15160955347871236,
"grad_norm": 0.4702441990375519,
"learning_rate": 8e-05,
"loss": 1.5585,
"step": 730
},
{
"epoch": 0.15181723779854622,
"grad_norm": 0.48316627740859985,
"learning_rate": 8e-05,
"loss": 1.6167,
"step": 731
},
{
"epoch": 0.15202492211838006,
"grad_norm": 0.5125194191932678,
"learning_rate": 8e-05,
"loss": 1.6876,
"step": 732
},
{
"epoch": 0.1522326064382139,
"grad_norm": 0.4879029393196106,
"learning_rate": 8e-05,
"loss": 1.6086,
"step": 733
},
{
"epoch": 0.15244029075804777,
"grad_norm": 0.48626405000686646,
"learning_rate": 8e-05,
"loss": 1.5933,
"step": 734
},
{
"epoch": 0.1526479750778816,
"grad_norm": 0.504183828830719,
"learning_rate": 8e-05,
"loss": 1.6104,
"step": 735
},
{
"epoch": 0.15285565939771548,
"grad_norm": 0.4829785227775574,
"learning_rate": 8e-05,
"loss": 1.6874,
"step": 736
},
{
"epoch": 0.15306334371754932,
"grad_norm": 0.5468804240226746,
"learning_rate": 8e-05,
"loss": 1.7613,
"step": 737
},
{
"epoch": 0.15327102803738318,
"grad_norm": 0.5711941719055176,
"learning_rate": 8e-05,
"loss": 1.6815,
"step": 738
},
{
"epoch": 0.15347871235721702,
"grad_norm": 0.4895186722278595,
"learning_rate": 8e-05,
"loss": 1.6076,
"step": 739
},
{
"epoch": 0.1536863966770509,
"grad_norm": 0.5142952799797058,
"learning_rate": 8e-05,
"loss": 1.7081,
"step": 740
},
{
"epoch": 0.15389408099688473,
"grad_norm": 0.498079776763916,
"learning_rate": 8e-05,
"loss": 1.6074,
"step": 741
},
{
"epoch": 0.1541017653167186,
"grad_norm": 0.47368738055229187,
"learning_rate": 8e-05,
"loss": 1.6465,
"step": 742
},
{
"epoch": 0.15430944963655244,
"grad_norm": 0.5251184105873108,
"learning_rate": 8e-05,
"loss": 1.7324,
"step": 743
},
{
"epoch": 0.1545171339563863,
"grad_norm": 0.49982884526252747,
"learning_rate": 8e-05,
"loss": 1.7063,
"step": 744
},
{
"epoch": 0.15472481827622014,
"grad_norm": 0.4818509519100189,
"learning_rate": 8e-05,
"loss": 1.6719,
"step": 745
},
{
"epoch": 0.154932502596054,
"grad_norm": 0.48312175273895264,
"learning_rate": 8e-05,
"loss": 1.6511,
"step": 746
},
{
"epoch": 0.15514018691588785,
"grad_norm": 0.4745786190032959,
"learning_rate": 8e-05,
"loss": 1.6419,
"step": 747
},
{
"epoch": 0.1553478712357217,
"grad_norm": 0.5050838589668274,
"learning_rate": 8e-05,
"loss": 1.6486,
"step": 748
},
{
"epoch": 0.15555555555555556,
"grad_norm": 0.4608010947704315,
"learning_rate": 8e-05,
"loss": 1.5949,
"step": 749
},
{
"epoch": 0.1557632398753894,
"grad_norm": 0.499937504529953,
"learning_rate": 8e-05,
"loss": 1.6977,
"step": 750
},
{
"epoch": 0.15597092419522327,
"grad_norm": 0.48160862922668457,
"learning_rate": 8e-05,
"loss": 1.6612,
"step": 751
},
{
"epoch": 0.1561786085150571,
"grad_norm": 0.47431129217147827,
"learning_rate": 8e-05,
"loss": 1.5773,
"step": 752
},
{
"epoch": 0.15638629283489097,
"grad_norm": 0.48737314343452454,
"learning_rate": 8e-05,
"loss": 1.5839,
"step": 753
},
{
"epoch": 0.1565939771547248,
"grad_norm": 0.49073323607444763,
"learning_rate": 8e-05,
"loss": 1.5981,
"step": 754
},
{
"epoch": 0.15680166147455868,
"grad_norm": 0.4805930554866791,
"learning_rate": 8e-05,
"loss": 1.5883,
"step": 755
},
{
"epoch": 0.15700934579439252,
"grad_norm": 0.4898761212825775,
"learning_rate": 8e-05,
"loss": 1.6197,
"step": 756
},
{
"epoch": 0.15721703011422639,
"grad_norm": 0.48299679160118103,
"learning_rate": 8e-05,
"loss": 1.6251,
"step": 757
},
{
"epoch": 0.15742471443406023,
"grad_norm": 0.483012318611145,
"learning_rate": 8e-05,
"loss": 1.5927,
"step": 758
},
{
"epoch": 0.1576323987538941,
"grad_norm": 0.49116986989974976,
"learning_rate": 8e-05,
"loss": 1.6458,
"step": 759
},
{
"epoch": 0.15784008307372793,
"grad_norm": 0.4684026837348938,
"learning_rate": 8e-05,
"loss": 1.585,
"step": 760
},
{
"epoch": 0.1580477673935618,
"grad_norm": 0.4952329099178314,
"learning_rate": 8e-05,
"loss": 1.6516,
"step": 761
},
{
"epoch": 0.15825545171339564,
"grad_norm": 0.4795088768005371,
"learning_rate": 8e-05,
"loss": 1.6788,
"step": 762
},
{
"epoch": 0.15846313603322948,
"grad_norm": 0.481624960899353,
"learning_rate": 8e-05,
"loss": 1.577,
"step": 763
},
{
"epoch": 0.15867082035306335,
"grad_norm": 0.48516589403152466,
"learning_rate": 8e-05,
"loss": 1.6067,
"step": 764
},
{
"epoch": 0.1588785046728972,
"grad_norm": 0.5102306604385376,
"learning_rate": 8e-05,
"loss": 1.7154,
"step": 765
},
{
"epoch": 0.15908618899273105,
"grad_norm": 0.48237496614456177,
"learning_rate": 8e-05,
"loss": 1.6296,
"step": 766
},
{
"epoch": 0.1592938733125649,
"grad_norm": 0.5044882297515869,
"learning_rate": 8e-05,
"loss": 1.6751,
"step": 767
},
{
"epoch": 0.15950155763239876,
"grad_norm": 0.47297602891921997,
"learning_rate": 8e-05,
"loss": 1.6108,
"step": 768
},
{
"epoch": 0.1597092419522326,
"grad_norm": 0.5018360018730164,
"learning_rate": 8e-05,
"loss": 1.6738,
"step": 769
},
{
"epoch": 0.15991692627206647,
"grad_norm": 0.49589648842811584,
"learning_rate": 8e-05,
"loss": 1.6752,
"step": 770
},
{
"epoch": 0.1601246105919003,
"grad_norm": 0.4882931411266327,
"learning_rate": 8e-05,
"loss": 1.666,
"step": 771
},
{
"epoch": 0.16033229491173417,
"grad_norm": 0.4880520701408386,
"learning_rate": 8e-05,
"loss": 1.5941,
"step": 772
},
{
"epoch": 0.16053997923156801,
"grad_norm": 0.4973679780960083,
"learning_rate": 8e-05,
"loss": 1.6388,
"step": 773
},
{
"epoch": 0.16074766355140188,
"grad_norm": 0.494547039270401,
"learning_rate": 8e-05,
"loss": 1.6878,
"step": 774
},
{
"epoch": 0.16095534787123572,
"grad_norm": 0.4797399938106537,
"learning_rate": 8e-05,
"loss": 1.6462,
"step": 775
},
{
"epoch": 0.16116303219106956,
"grad_norm": 0.4973697364330292,
"learning_rate": 8e-05,
"loss": 1.6277,
"step": 776
},
{
"epoch": 0.16137071651090343,
"grad_norm": 0.4668048322200775,
"learning_rate": 8e-05,
"loss": 1.5988,
"step": 777
},
{
"epoch": 0.16157840083073727,
"grad_norm": 0.49718210101127625,
"learning_rate": 8e-05,
"loss": 1.6764,
"step": 778
},
{
"epoch": 0.16178608515057114,
"grad_norm": 0.48876386880874634,
"learning_rate": 8e-05,
"loss": 1.6308,
"step": 779
},
{
"epoch": 0.16199376947040497,
"grad_norm": 0.4705182909965515,
"learning_rate": 8e-05,
"loss": 1.6355,
"step": 780
},
{
"epoch": 0.16220145379023884,
"grad_norm": 0.48368439078330994,
"learning_rate": 8e-05,
"loss": 1.6519,
"step": 781
},
{
"epoch": 0.16240913811007268,
"grad_norm": 0.479199081659317,
"learning_rate": 8e-05,
"loss": 1.6046,
"step": 782
},
{
"epoch": 0.16261682242990655,
"grad_norm": 0.4942733943462372,
"learning_rate": 8e-05,
"loss": 1.6996,
"step": 783
},
{
"epoch": 0.1628245067497404,
"grad_norm": 0.5083856582641602,
"learning_rate": 8e-05,
"loss": 1.7383,
"step": 784
},
{
"epoch": 0.16303219106957426,
"grad_norm": 0.46334561705589294,
"learning_rate": 8e-05,
"loss": 1.6044,
"step": 785
},
{
"epoch": 0.1632398753894081,
"grad_norm": 0.5042205452919006,
"learning_rate": 8e-05,
"loss": 1.6773,
"step": 786
},
{
"epoch": 0.16344755970924196,
"grad_norm": 0.4855109453201294,
"learning_rate": 8e-05,
"loss": 1.6655,
"step": 787
},
{
"epoch": 0.1636552440290758,
"grad_norm": 0.4775260090827942,
"learning_rate": 8e-05,
"loss": 1.5763,
"step": 788
},
{
"epoch": 0.16386292834890967,
"grad_norm": 0.49258020520210266,
"learning_rate": 8e-05,
"loss": 1.6628,
"step": 789
},
{
"epoch": 0.1640706126687435,
"grad_norm": 0.4889313280582428,
"learning_rate": 8e-05,
"loss": 1.6056,
"step": 790
},
{
"epoch": 0.16427829698857735,
"grad_norm": 0.49056270718574524,
"learning_rate": 8e-05,
"loss": 1.6066,
"step": 791
},
{
"epoch": 0.16448598130841122,
"grad_norm": 0.47176501154899597,
"learning_rate": 8e-05,
"loss": 1.6171,
"step": 792
},
{
"epoch": 0.16469366562824506,
"grad_norm": 0.5094092488288879,
"learning_rate": 8e-05,
"loss": 1.6178,
"step": 793
},
{
"epoch": 0.16490134994807892,
"grad_norm": 0.4841829240322113,
"learning_rate": 8e-05,
"loss": 1.5962,
"step": 794
},
{
"epoch": 0.16510903426791276,
"grad_norm": 0.4807954728603363,
"learning_rate": 8e-05,
"loss": 1.5233,
"step": 795
},
{
"epoch": 0.16531671858774663,
"grad_norm": 0.4819432497024536,
"learning_rate": 8e-05,
"loss": 1.6909,
"step": 796
},
{
"epoch": 0.16552440290758047,
"grad_norm": 0.513128936290741,
"learning_rate": 8e-05,
"loss": 1.5513,
"step": 797
},
{
"epoch": 0.16573208722741434,
"grad_norm": 0.4752349555492401,
"learning_rate": 8e-05,
"loss": 1.5488,
"step": 798
},
{
"epoch": 0.16593977154724818,
"grad_norm": 0.48235711455345154,
"learning_rate": 8e-05,
"loss": 1.6281,
"step": 799
},
{
"epoch": 0.16614745586708204,
"grad_norm": 0.49711811542510986,
"learning_rate": 8e-05,
"loss": 1.6796,
"step": 800
},
{
"epoch": 0.16635514018691588,
"grad_norm": 0.5144046545028687,
"learning_rate": 8e-05,
"loss": 1.6711,
"step": 801
},
{
"epoch": 0.16656282450674975,
"grad_norm": 0.506710410118103,
"learning_rate": 8e-05,
"loss": 1.6507,
"step": 802
},
{
"epoch": 0.1667705088265836,
"grad_norm": 0.5138517618179321,
"learning_rate": 8e-05,
"loss": 1.6207,
"step": 803
},
{
"epoch": 0.16697819314641746,
"grad_norm": 0.480153352022171,
"learning_rate": 8e-05,
"loss": 1.6877,
"step": 804
},
{
"epoch": 0.1671858774662513,
"grad_norm": 0.5096755623817444,
"learning_rate": 8e-05,
"loss": 1.7058,
"step": 805
},
{
"epoch": 0.16739356178608514,
"grad_norm": 0.49383774399757385,
"learning_rate": 8e-05,
"loss": 1.6983,
"step": 806
},
{
"epoch": 0.167601246105919,
"grad_norm": 0.5019107460975647,
"learning_rate": 8e-05,
"loss": 1.6543,
"step": 807
},
{
"epoch": 0.16780893042575284,
"grad_norm": 0.48990175127983093,
"learning_rate": 8e-05,
"loss": 1.6206,
"step": 808
},
{
"epoch": 0.1680166147455867,
"grad_norm": 0.4764305055141449,
"learning_rate": 8e-05,
"loss": 1.6453,
"step": 809
},
{
"epoch": 0.16822429906542055,
"grad_norm": 0.5044610500335693,
"learning_rate": 8e-05,
"loss": 1.6739,
"step": 810
},
{
"epoch": 0.16843198338525442,
"grad_norm": 0.48134395480155945,
"learning_rate": 8e-05,
"loss": 1.6153,
"step": 811
},
{
"epoch": 0.16863966770508826,
"grad_norm": 0.4967174232006073,
"learning_rate": 8e-05,
"loss": 1.677,
"step": 812
},
{
"epoch": 0.16884735202492213,
"grad_norm": 0.47364965081214905,
"learning_rate": 8e-05,
"loss": 1.555,
"step": 813
},
{
"epoch": 0.16905503634475597,
"grad_norm": 0.49788594245910645,
"learning_rate": 8e-05,
"loss": 1.6635,
"step": 814
},
{
"epoch": 0.16926272066458983,
"grad_norm": 0.49295979738235474,
"learning_rate": 8e-05,
"loss": 1.6627,
"step": 815
},
{
"epoch": 0.16947040498442367,
"grad_norm": 0.5006287693977356,
"learning_rate": 8e-05,
"loss": 1.6183,
"step": 816
},
{
"epoch": 0.16967808930425754,
"grad_norm": 0.4745385944843292,
"learning_rate": 8e-05,
"loss": 1.585,
"step": 817
},
{
"epoch": 0.16988577362409138,
"grad_norm": 0.5007401704788208,
"learning_rate": 8e-05,
"loss": 1.6358,
"step": 818
},
{
"epoch": 0.17009345794392525,
"grad_norm": 0.4691254198551178,
"learning_rate": 8e-05,
"loss": 1.605,
"step": 819
},
{
"epoch": 0.1703011422637591,
"grad_norm": 0.48528677225112915,
"learning_rate": 8e-05,
"loss": 1.6889,
"step": 820
},
{
"epoch": 0.17050882658359293,
"grad_norm": 0.48160579800605774,
"learning_rate": 8e-05,
"loss": 1.6284,
"step": 821
},
{
"epoch": 0.1707165109034268,
"grad_norm": 0.4866841435432434,
"learning_rate": 8e-05,
"loss": 1.635,
"step": 822
},
{
"epoch": 0.17092419522326063,
"grad_norm": 0.4998115003108978,
"learning_rate": 8e-05,
"loss": 1.7034,
"step": 823
},
{
"epoch": 0.1711318795430945,
"grad_norm": 0.49009475111961365,
"learning_rate": 8e-05,
"loss": 1.6505,
"step": 824
},
{
"epoch": 0.17133956386292834,
"grad_norm": 0.47515684366226196,
"learning_rate": 8e-05,
"loss": 1.6409,
"step": 825
},
{
"epoch": 0.1715472481827622,
"grad_norm": 0.5075555443763733,
"learning_rate": 8e-05,
"loss": 1.6623,
"step": 826
},
{
"epoch": 0.17175493250259605,
"grad_norm": 0.4834977984428406,
"learning_rate": 8e-05,
"loss": 1.6343,
"step": 827
},
{
"epoch": 0.17196261682242991,
"grad_norm": 0.4810510575771332,
"learning_rate": 8e-05,
"loss": 1.5518,
"step": 828
},
{
"epoch": 0.17217030114226375,
"grad_norm": 0.5000099539756775,
"learning_rate": 8e-05,
"loss": 1.6423,
"step": 829
},
{
"epoch": 0.17237798546209762,
"grad_norm": 0.4854555130004883,
"learning_rate": 8e-05,
"loss": 1.6345,
"step": 830
},
{
"epoch": 0.17258566978193146,
"grad_norm": 0.47438621520996094,
"learning_rate": 8e-05,
"loss": 1.5127,
"step": 831
},
{
"epoch": 0.17279335410176533,
"grad_norm": 0.4803098440170288,
"learning_rate": 8e-05,
"loss": 1.5919,
"step": 832
},
{
"epoch": 0.17300103842159917,
"grad_norm": 0.48187175393104553,
"learning_rate": 8e-05,
"loss": 1.6423,
"step": 833
},
{
"epoch": 0.17320872274143304,
"grad_norm": 0.4821400046348572,
"learning_rate": 8e-05,
"loss": 1.6798,
"step": 834
},
{
"epoch": 0.17341640706126688,
"grad_norm": 0.4715557098388672,
"learning_rate": 8e-05,
"loss": 1.6162,
"step": 835
},
{
"epoch": 0.17362409138110071,
"grad_norm": 0.48765188455581665,
"learning_rate": 8e-05,
"loss": 1.6846,
"step": 836
},
{
"epoch": 0.17383177570093458,
"grad_norm": 0.5081665515899658,
"learning_rate": 8e-05,
"loss": 1.6857,
"step": 837
},
{
"epoch": 0.17403946002076842,
"grad_norm": 0.49220722913742065,
"learning_rate": 8e-05,
"loss": 1.6495,
"step": 838
},
{
"epoch": 0.1742471443406023,
"grad_norm": 0.461845338344574,
"learning_rate": 8e-05,
"loss": 1.6098,
"step": 839
},
{
"epoch": 0.17445482866043613,
"grad_norm": 0.4715192914009094,
"learning_rate": 8e-05,
"loss": 1.5672,
"step": 840
},
{
"epoch": 0.17466251298027,
"grad_norm": 0.4969162940979004,
"learning_rate": 8e-05,
"loss": 1.6727,
"step": 841
},
{
"epoch": 0.17487019730010384,
"grad_norm": 0.4748702645301819,
"learning_rate": 8e-05,
"loss": 1.6493,
"step": 842
},
{
"epoch": 0.1750778816199377,
"grad_norm": 0.47799915075302124,
"learning_rate": 8e-05,
"loss": 1.5978,
"step": 843
},
{
"epoch": 0.17528556593977154,
"grad_norm": 0.4815102815628052,
"learning_rate": 8e-05,
"loss": 1.587,
"step": 844
},
{
"epoch": 0.1754932502596054,
"grad_norm": 0.4794101417064667,
"learning_rate": 8e-05,
"loss": 1.5926,
"step": 845
},
{
"epoch": 0.17570093457943925,
"grad_norm": 0.480980783700943,
"learning_rate": 8e-05,
"loss": 1.6106,
"step": 846
},
{
"epoch": 0.17590861889927312,
"grad_norm": 0.483927458524704,
"learning_rate": 8e-05,
"loss": 1.6169,
"step": 847
},
{
"epoch": 0.17611630321910696,
"grad_norm": 0.4977688193321228,
"learning_rate": 8e-05,
"loss": 1.6233,
"step": 848
},
{
"epoch": 0.1763239875389408,
"grad_norm": 0.4976138770580292,
"learning_rate": 8e-05,
"loss": 1.7223,
"step": 849
},
{
"epoch": 0.17653167185877466,
"grad_norm": 0.4990401864051819,
"learning_rate": 8e-05,
"loss": 1.6844,
"step": 850
},
{
"epoch": 0.1767393561786085,
"grad_norm": 0.5020548105239868,
"learning_rate": 8e-05,
"loss": 1.6672,
"step": 851
},
{
"epoch": 0.17694704049844237,
"grad_norm": 0.5174390077590942,
"learning_rate": 8e-05,
"loss": 1.634,
"step": 852
},
{
"epoch": 0.1771547248182762,
"grad_norm": 0.4973217248916626,
"learning_rate": 8e-05,
"loss": 1.6551,
"step": 853
},
{
"epoch": 0.17736240913811008,
"grad_norm": 0.504899263381958,
"learning_rate": 8e-05,
"loss": 1.6016,
"step": 854
},
{
"epoch": 0.17757009345794392,
"grad_norm": 0.5085044503211975,
"learning_rate": 8e-05,
"loss": 1.6352,
"step": 855
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.4816834330558777,
"learning_rate": 8e-05,
"loss": 1.5832,
"step": 856
},
{
"epoch": 0.17798546209761162,
"grad_norm": 0.5155865550041199,
"learning_rate": 8e-05,
"loss": 1.6151,
"step": 857
},
{
"epoch": 0.1781931464174455,
"grad_norm": 0.48705244064331055,
"learning_rate": 8e-05,
"loss": 1.648,
"step": 858
},
{
"epoch": 0.17840083073727933,
"grad_norm": 0.5086380243301392,
"learning_rate": 8e-05,
"loss": 1.6601,
"step": 859
},
{
"epoch": 0.1786085150571132,
"grad_norm": 0.46647846698760986,
"learning_rate": 8e-05,
"loss": 1.6007,
"step": 860
},
{
"epoch": 0.17881619937694704,
"grad_norm": 0.4949185252189636,
"learning_rate": 8e-05,
"loss": 1.6181,
"step": 861
},
{
"epoch": 0.1790238836967809,
"grad_norm": 0.4831303656101227,
"learning_rate": 8e-05,
"loss": 1.6037,
"step": 862
},
{
"epoch": 0.17923156801661475,
"grad_norm": 0.5041294693946838,
"learning_rate": 8e-05,
"loss": 1.6582,
"step": 863
},
{
"epoch": 0.17943925233644858,
"grad_norm": 0.5045524835586548,
"learning_rate": 8e-05,
"loss": 1.6141,
"step": 864
},
{
"epoch": 0.17964693665628245,
"grad_norm": 0.49055367708206177,
"learning_rate": 8e-05,
"loss": 1.6371,
"step": 865
},
{
"epoch": 0.1798546209761163,
"grad_norm": 0.5056166648864746,
"learning_rate": 8e-05,
"loss": 1.6949,
"step": 866
},
{
"epoch": 0.18006230529595016,
"grad_norm": 0.4675775170326233,
"learning_rate": 8e-05,
"loss": 1.5697,
"step": 867
},
{
"epoch": 0.180269989615784,
"grad_norm": 0.490346223115921,
"learning_rate": 8e-05,
"loss": 1.6092,
"step": 868
},
{
"epoch": 0.18047767393561787,
"grad_norm": 0.4684089720249176,
"learning_rate": 8e-05,
"loss": 1.5648,
"step": 869
},
{
"epoch": 0.1806853582554517,
"grad_norm": 0.46101874113082886,
"learning_rate": 8e-05,
"loss": 1.566,
"step": 870
},
{
"epoch": 0.18089304257528557,
"grad_norm": 0.4861421287059784,
"learning_rate": 8e-05,
"loss": 1.6511,
"step": 871
},
{
"epoch": 0.1811007268951194,
"grad_norm": 0.48315370082855225,
"learning_rate": 8e-05,
"loss": 1.6527,
"step": 872
},
{
"epoch": 0.18130841121495328,
"grad_norm": 0.48477229475975037,
"learning_rate": 8e-05,
"loss": 1.6477,
"step": 873
},
{
"epoch": 0.18151609553478712,
"grad_norm": 0.4834623336791992,
"learning_rate": 8e-05,
"loss": 1.5799,
"step": 874
},
{
"epoch": 0.181723779854621,
"grad_norm": 0.4903808534145355,
"learning_rate": 8e-05,
"loss": 1.6191,
"step": 875
},
{
"epoch": 0.18193146417445483,
"grad_norm": 0.4885776937007904,
"learning_rate": 8e-05,
"loss": 1.6815,
"step": 876
},
{
"epoch": 0.1821391484942887,
"grad_norm": 0.48751139640808105,
"learning_rate": 8e-05,
"loss": 1.6905,
"step": 877
},
{
"epoch": 0.18234683281412253,
"grad_norm": 0.48983898758888245,
"learning_rate": 8e-05,
"loss": 1.5902,
"step": 878
},
{
"epoch": 0.18255451713395637,
"grad_norm": 0.47432875633239746,
"learning_rate": 8e-05,
"loss": 1.6571,
"step": 879
},
{
"epoch": 0.18276220145379024,
"grad_norm": 0.4667503535747528,
"learning_rate": 8e-05,
"loss": 1.6198,
"step": 880
},
{
"epoch": 0.18296988577362408,
"grad_norm": 0.5021142363548279,
"learning_rate": 8e-05,
"loss": 1.7175,
"step": 881
},
{
"epoch": 0.18317757009345795,
"grad_norm": 0.46607160568237305,
"learning_rate": 8e-05,
"loss": 1.5967,
"step": 882
},
{
"epoch": 0.1833852544132918,
"grad_norm": 0.4718491733074188,
"learning_rate": 8e-05,
"loss": 1.5974,
"step": 883
},
{
"epoch": 0.18359293873312565,
"grad_norm": 0.49445825815200806,
"learning_rate": 8e-05,
"loss": 1.6271,
"step": 884
},
{
"epoch": 0.1838006230529595,
"grad_norm": 0.46848443150520325,
"learning_rate": 8e-05,
"loss": 1.6061,
"step": 885
},
{
"epoch": 0.18400830737279336,
"grad_norm": 0.46459993720054626,
"learning_rate": 8e-05,
"loss": 1.5613,
"step": 886
},
{
"epoch": 0.1842159916926272,
"grad_norm": 0.47281578183174133,
"learning_rate": 8e-05,
"loss": 1.704,
"step": 887
},
{
"epoch": 0.18442367601246107,
"grad_norm": 0.456195592880249,
"learning_rate": 8e-05,
"loss": 1.559,
"step": 888
},
{
"epoch": 0.1846313603322949,
"grad_norm": 0.5002410411834717,
"learning_rate": 8e-05,
"loss": 1.6179,
"step": 889
},
{
"epoch": 0.18483904465212878,
"grad_norm": 0.4889829158782959,
"learning_rate": 8e-05,
"loss": 1.6081,
"step": 890
},
{
"epoch": 0.18504672897196262,
"grad_norm": 0.5022076964378357,
"learning_rate": 8e-05,
"loss": 1.629,
"step": 891
},
{
"epoch": 0.18525441329179648,
"grad_norm": 0.496134489774704,
"learning_rate": 8e-05,
"loss": 1.611,
"step": 892
},
{
"epoch": 0.18546209761163032,
"grad_norm": 0.4800807237625122,
"learning_rate": 8e-05,
"loss": 1.68,
"step": 893
},
{
"epoch": 0.18566978193146416,
"grad_norm": 0.4833254814147949,
"learning_rate": 8e-05,
"loss": 1.6493,
"step": 894
},
{
"epoch": 0.18587746625129803,
"grad_norm": 0.47736915946006775,
"learning_rate": 8e-05,
"loss": 1.5986,
"step": 895
},
{
"epoch": 0.18608515057113187,
"grad_norm": 0.5091724395751953,
"learning_rate": 8e-05,
"loss": 1.6303,
"step": 896
},
{
"epoch": 0.18629283489096574,
"grad_norm": 0.46926015615463257,
"learning_rate": 8e-05,
"loss": 1.5778,
"step": 897
},
{
"epoch": 0.18650051921079958,
"grad_norm": 0.48769834637641907,
"learning_rate": 8e-05,
"loss": 1.6396,
"step": 898
},
{
"epoch": 0.18670820353063344,
"grad_norm": 0.47755125164985657,
"learning_rate": 8e-05,
"loss": 1.6374,
"step": 899
},
{
"epoch": 0.18691588785046728,
"grad_norm": 0.4844597280025482,
"learning_rate": 8e-05,
"loss": 1.6072,
"step": 900
},
{
"epoch": 0.18712357217030115,
"grad_norm": 0.5078486800193787,
"learning_rate": 8e-05,
"loss": 1.6898,
"step": 901
},
{
"epoch": 0.187331256490135,
"grad_norm": 0.4972101151943207,
"learning_rate": 8e-05,
"loss": 1.7502,
"step": 902
},
{
"epoch": 0.18753894080996886,
"grad_norm": 0.4829871952533722,
"learning_rate": 8e-05,
"loss": 1.615,
"step": 903
},
{
"epoch": 0.1877466251298027,
"grad_norm": 0.5250853300094604,
"learning_rate": 8e-05,
"loss": 1.6795,
"step": 904
},
{
"epoch": 0.18795430944963656,
"grad_norm": 0.4843866229057312,
"learning_rate": 8e-05,
"loss": 1.6664,
"step": 905
},
{
"epoch": 0.1881619937694704,
"grad_norm": 0.4833875894546509,
"learning_rate": 8e-05,
"loss": 1.5387,
"step": 906
},
{
"epoch": 0.18836967808930427,
"grad_norm": 0.5005854368209839,
"learning_rate": 8e-05,
"loss": 1.5622,
"step": 907
},
{
"epoch": 0.1885773624091381,
"grad_norm": 0.47214603424072266,
"learning_rate": 8e-05,
"loss": 1.6011,
"step": 908
},
{
"epoch": 0.18878504672897195,
"grad_norm": 0.5020946264266968,
"learning_rate": 8e-05,
"loss": 1.6588,
"step": 909
},
{
"epoch": 0.18899273104880582,
"grad_norm": 0.5007855892181396,
"learning_rate": 8e-05,
"loss": 1.6688,
"step": 910
},
{
"epoch": 0.18920041536863966,
"grad_norm": 0.5172010064125061,
"learning_rate": 8e-05,
"loss": 1.6751,
"step": 911
},
{
"epoch": 0.18940809968847352,
"grad_norm": 0.4866928160190582,
"learning_rate": 8e-05,
"loss": 1.56,
"step": 912
},
{
"epoch": 0.18961578400830736,
"grad_norm": 0.47611889243125916,
"learning_rate": 8e-05,
"loss": 1.5304,
"step": 913
},
{
"epoch": 0.18982346832814123,
"grad_norm": 0.4758463203907013,
"learning_rate": 8e-05,
"loss": 1.6201,
"step": 914
},
{
"epoch": 0.19003115264797507,
"grad_norm": 0.4987952709197998,
"learning_rate": 8e-05,
"loss": 1.6785,
"step": 915
},
{
"epoch": 0.19023883696780894,
"grad_norm": 0.5098257064819336,
"learning_rate": 8e-05,
"loss": 1.6758,
"step": 916
},
{
"epoch": 0.19044652128764278,
"grad_norm": 0.4878323972225189,
"learning_rate": 8e-05,
"loss": 1.5964,
"step": 917
},
{
"epoch": 0.19065420560747665,
"grad_norm": 0.49030864238739014,
"learning_rate": 8e-05,
"loss": 1.6166,
"step": 918
},
{
"epoch": 0.19086188992731049,
"grad_norm": 0.4641314446926117,
"learning_rate": 8e-05,
"loss": 1.6265,
"step": 919
},
{
"epoch": 0.19106957424714435,
"grad_norm": 0.49231404066085815,
"learning_rate": 8e-05,
"loss": 1.6691,
"step": 920
},
{
"epoch": 0.1912772585669782,
"grad_norm": 0.5089201331138611,
"learning_rate": 8e-05,
"loss": 1.6911,
"step": 921
},
{
"epoch": 0.19148494288681206,
"grad_norm": 0.4495040774345398,
"learning_rate": 8e-05,
"loss": 1.5579,
"step": 922
},
{
"epoch": 0.1916926272066459,
"grad_norm": 0.47652044892311096,
"learning_rate": 8e-05,
"loss": 1.5685,
"step": 923
},
{
"epoch": 0.19190031152647974,
"grad_norm": 0.4827858805656433,
"learning_rate": 8e-05,
"loss": 1.6168,
"step": 924
},
{
"epoch": 0.1921079958463136,
"grad_norm": 0.47965332865715027,
"learning_rate": 8e-05,
"loss": 1.5983,
"step": 925
},
{
"epoch": 0.19231568016614745,
"grad_norm": 0.4766390919685364,
"learning_rate": 8e-05,
"loss": 1.599,
"step": 926
},
{
"epoch": 0.1925233644859813,
"grad_norm": 0.48412805795669556,
"learning_rate": 8e-05,
"loss": 1.6749,
"step": 927
},
{
"epoch": 0.19273104880581515,
"grad_norm": 0.47555628418922424,
"learning_rate": 8e-05,
"loss": 1.5841,
"step": 928
},
{
"epoch": 0.19293873312564902,
"grad_norm": 0.48913460969924927,
"learning_rate": 8e-05,
"loss": 1.6512,
"step": 929
},
{
"epoch": 0.19314641744548286,
"grad_norm": 0.47544533014297485,
"learning_rate": 8e-05,
"loss": 1.6341,
"step": 930
},
{
"epoch": 0.19335410176531673,
"grad_norm": 0.4919319450855255,
"learning_rate": 8e-05,
"loss": 1.6381,
"step": 931
},
{
"epoch": 0.19356178608515057,
"grad_norm": 0.49583807587623596,
"learning_rate": 8e-05,
"loss": 1.6419,
"step": 932
},
{
"epoch": 0.19376947040498443,
"grad_norm": 0.5070441961288452,
"learning_rate": 8e-05,
"loss": 1.6657,
"step": 933
},
{
"epoch": 0.19397715472481827,
"grad_norm": 0.5061896443367004,
"learning_rate": 8e-05,
"loss": 1.6081,
"step": 934
},
{
"epoch": 0.19418483904465214,
"grad_norm": 0.5091527104377747,
"learning_rate": 8e-05,
"loss": 1.6912,
"step": 935
},
{
"epoch": 0.19439252336448598,
"grad_norm": 0.5117039084434509,
"learning_rate": 8e-05,
"loss": 1.632,
"step": 936
},
{
"epoch": 0.19460020768431982,
"grad_norm": 0.4791244864463806,
"learning_rate": 8e-05,
"loss": 1.6062,
"step": 937
},
{
"epoch": 0.1948078920041537,
"grad_norm": 0.5442696213722229,
"learning_rate": 8e-05,
"loss": 1.6536,
"step": 938
},
{
"epoch": 0.19501557632398753,
"grad_norm": 0.48307886719703674,
"learning_rate": 8e-05,
"loss": 1.5979,
"step": 939
},
{
"epoch": 0.1952232606438214,
"grad_norm": 0.5369094014167786,
"learning_rate": 8e-05,
"loss": 1.5968,
"step": 940
},
{
"epoch": 0.19543094496365523,
"grad_norm": 0.4949534237384796,
"learning_rate": 8e-05,
"loss": 1.6489,
"step": 941
},
{
"epoch": 0.1956386292834891,
"grad_norm": 0.5377048850059509,
"learning_rate": 8e-05,
"loss": 1.5288,
"step": 942
},
{
"epoch": 0.19584631360332294,
"grad_norm": 0.49918225407600403,
"learning_rate": 8e-05,
"loss": 1.5978,
"step": 943
},
{
"epoch": 0.1960539979231568,
"grad_norm": 0.5020906329154968,
"learning_rate": 8e-05,
"loss": 1.6248,
"step": 944
},
{
"epoch": 0.19626168224299065,
"grad_norm": 0.5110490918159485,
"learning_rate": 8e-05,
"loss": 1.5623,
"step": 945
},
{
"epoch": 0.19646936656282452,
"grad_norm": 0.4651744067668915,
"learning_rate": 8e-05,
"loss": 1.554,
"step": 946
},
{
"epoch": 0.19667705088265836,
"grad_norm": 0.5259853005409241,
"learning_rate": 8e-05,
"loss": 1.619,
"step": 947
},
{
"epoch": 0.19688473520249222,
"grad_norm": 0.5092069506645203,
"learning_rate": 8e-05,
"loss": 1.6075,
"step": 948
},
{
"epoch": 0.19709241952232606,
"grad_norm": 0.47999849915504456,
"learning_rate": 8e-05,
"loss": 1.4621,
"step": 949
},
{
"epoch": 0.19730010384215993,
"grad_norm": 0.5035858750343323,
"learning_rate": 8e-05,
"loss": 1.5704,
"step": 950
},
{
"epoch": 0.19750778816199377,
"grad_norm": 0.5029184222221375,
"learning_rate": 8e-05,
"loss": 1.691,
"step": 951
},
{
"epoch": 0.1977154724818276,
"grad_norm": 0.5592398047447205,
"learning_rate": 8e-05,
"loss": 1.6728,
"step": 952
},
{
"epoch": 0.19792315680166148,
"grad_norm": 0.5204004645347595,
"learning_rate": 8e-05,
"loss": 1.5816,
"step": 953
},
{
"epoch": 0.19813084112149532,
"grad_norm": 0.5047582983970642,
"learning_rate": 8e-05,
"loss": 1.6473,
"step": 954
},
{
"epoch": 0.19833852544132918,
"grad_norm": 0.5084308385848999,
"learning_rate": 8e-05,
"loss": 1.6297,
"step": 955
},
{
"epoch": 0.19854620976116302,
"grad_norm": 0.4851769506931305,
"learning_rate": 8e-05,
"loss": 1.676,
"step": 956
},
{
"epoch": 0.1987538940809969,
"grad_norm": 0.48817214369773865,
"learning_rate": 8e-05,
"loss": 1.6213,
"step": 957
},
{
"epoch": 0.19896157840083073,
"grad_norm": 0.5137952566146851,
"learning_rate": 8e-05,
"loss": 1.6807,
"step": 958
},
{
"epoch": 0.1991692627206646,
"grad_norm": 0.494804322719574,
"learning_rate": 8e-05,
"loss": 1.633,
"step": 959
},
{
"epoch": 0.19937694704049844,
"grad_norm": 0.4694535434246063,
"learning_rate": 8e-05,
"loss": 1.5782,
"step": 960
},
{
"epoch": 0.1995846313603323,
"grad_norm": 0.4649278223514557,
"learning_rate": 8e-05,
"loss": 1.5167,
"step": 961
},
{
"epoch": 0.19979231568016614,
"grad_norm": 0.49113568663597107,
"learning_rate": 8e-05,
"loss": 1.63,
"step": 962
},
{
"epoch": 0.2,
"grad_norm": 0.4700087308883667,
"learning_rate": 8e-05,
"loss": 1.5999,
"step": 963
},
{
"epoch": 0.20020768431983385,
"grad_norm": 0.4888930320739746,
"learning_rate": 8e-05,
"loss": 1.6966,
"step": 964
},
{
"epoch": 0.20041536863966772,
"grad_norm": 0.4866234362125397,
"learning_rate": 8e-05,
"loss": 1.6195,
"step": 965
},
{
"epoch": 0.20062305295950156,
"grad_norm": 0.5145866870880127,
"learning_rate": 8e-05,
"loss": 1.6721,
"step": 966
},
{
"epoch": 0.2008307372793354,
"grad_norm": 0.47302988171577454,
"learning_rate": 8e-05,
"loss": 1.6644,
"step": 967
},
{
"epoch": 0.20103842159916926,
"grad_norm": 0.48115986585617065,
"learning_rate": 8e-05,
"loss": 1.6557,
"step": 968
},
{
"epoch": 0.2012461059190031,
"grad_norm": 0.4676402807235718,
"learning_rate": 8e-05,
"loss": 1.5842,
"step": 969
},
{
"epoch": 0.20145379023883697,
"grad_norm": 0.4860267639160156,
"learning_rate": 8e-05,
"loss": 1.6519,
"step": 970
},
{
"epoch": 0.2016614745586708,
"grad_norm": 0.4892720580101013,
"learning_rate": 8e-05,
"loss": 1.5545,
"step": 971
},
{
"epoch": 0.20186915887850468,
"grad_norm": 0.46676620841026306,
"learning_rate": 8e-05,
"loss": 1.553,
"step": 972
},
{
"epoch": 0.20207684319833852,
"grad_norm": 0.47388455271720886,
"learning_rate": 8e-05,
"loss": 1.5923,
"step": 973
},
{
"epoch": 0.20228452751817239,
"grad_norm": 0.4880634546279907,
"learning_rate": 8e-05,
"loss": 1.5832,
"step": 974
},
{
"epoch": 0.20249221183800623,
"grad_norm": 0.4873626232147217,
"learning_rate": 8e-05,
"loss": 1.6224,
"step": 975
},
{
"epoch": 0.2026998961578401,
"grad_norm": 0.5022002458572388,
"learning_rate": 8e-05,
"loss": 1.6834,
"step": 976
},
{
"epoch": 0.20290758047767393,
"grad_norm": 0.47066470980644226,
"learning_rate": 8e-05,
"loss": 1.5887,
"step": 977
},
{
"epoch": 0.2031152647975078,
"grad_norm": 0.4675566554069519,
"learning_rate": 8e-05,
"loss": 1.6026,
"step": 978
},
{
"epoch": 0.20332294911734164,
"grad_norm": 0.5087757110595703,
"learning_rate": 8e-05,
"loss": 1.6192,
"step": 979
},
{
"epoch": 0.2035306334371755,
"grad_norm": 0.49187105894088745,
"learning_rate": 8e-05,
"loss": 1.6126,
"step": 980
},
{
"epoch": 0.20373831775700935,
"grad_norm": 0.4877180755138397,
"learning_rate": 8e-05,
"loss": 1.6781,
"step": 981
},
{
"epoch": 0.20394600207684319,
"grad_norm": 0.47267571091651917,
"learning_rate": 8e-05,
"loss": 1.646,
"step": 982
},
{
"epoch": 0.20415368639667705,
"grad_norm": 0.4767378270626068,
"learning_rate": 8e-05,
"loss": 1.5953,
"step": 983
},
{
"epoch": 0.2043613707165109,
"grad_norm": 0.48661693930625916,
"learning_rate": 8e-05,
"loss": 1.5469,
"step": 984
},
{
"epoch": 0.20456905503634476,
"grad_norm": 0.4928230047225952,
"learning_rate": 8e-05,
"loss": 1.7054,
"step": 985
},
{
"epoch": 0.2047767393561786,
"grad_norm": 0.4975184202194214,
"learning_rate": 8e-05,
"loss": 1.594,
"step": 986
},
{
"epoch": 0.20498442367601247,
"grad_norm": 0.4876691997051239,
"learning_rate": 8e-05,
"loss": 1.6356,
"step": 987
},
{
"epoch": 0.2051921079958463,
"grad_norm": 0.4785061776638031,
"learning_rate": 8e-05,
"loss": 1.6646,
"step": 988
},
{
"epoch": 0.20539979231568017,
"grad_norm": 0.49012821912765503,
"learning_rate": 8e-05,
"loss": 1.689,
"step": 989
},
{
"epoch": 0.205607476635514,
"grad_norm": 0.45855405926704407,
"learning_rate": 8e-05,
"loss": 1.6318,
"step": 990
},
{
"epoch": 0.20581516095534788,
"grad_norm": 0.4825817048549652,
"learning_rate": 8e-05,
"loss": 1.6189,
"step": 991
},
{
"epoch": 0.20602284527518172,
"grad_norm": 0.4723818004131317,
"learning_rate": 8e-05,
"loss": 1.5096,
"step": 992
},
{
"epoch": 0.2062305295950156,
"grad_norm": 0.46535414457321167,
"learning_rate": 8e-05,
"loss": 1.6112,
"step": 993
},
{
"epoch": 0.20643821391484943,
"grad_norm": 0.4890800714492798,
"learning_rate": 8e-05,
"loss": 1.6472,
"step": 994
},
{
"epoch": 0.2066458982346833,
"grad_norm": 0.48340532183647156,
"learning_rate": 8e-05,
"loss": 1.6347,
"step": 995
},
{
"epoch": 0.20685358255451713,
"grad_norm": 0.490472674369812,
"learning_rate": 8e-05,
"loss": 1.6745,
"step": 996
},
{
"epoch": 0.20706126687435097,
"grad_norm": 0.5247419476509094,
"learning_rate": 8e-05,
"loss": 1.7016,
"step": 997
},
{
"epoch": 0.20726895119418484,
"grad_norm": 0.4798457622528076,
"learning_rate": 8e-05,
"loss": 1.6581,
"step": 998
},
{
"epoch": 0.20747663551401868,
"grad_norm": 0.5022425651550293,
"learning_rate": 8e-05,
"loss": 1.6491,
"step": 999
},
{
"epoch": 0.20768431983385255,
"grad_norm": 0.5252692699432373,
"learning_rate": 8e-05,
"loss": 1.624,
"step": 1000
},
{
"epoch": 0.2078920041536864,
"grad_norm": 0.4893631041049957,
"learning_rate": 8e-05,
"loss": 1.6016,
"step": 1001
},
{
"epoch": 0.20809968847352026,
"grad_norm": 0.49601900577545166,
"learning_rate": 8e-05,
"loss": 1.6679,
"step": 1002
},
{
"epoch": 0.2083073727933541,
"grad_norm": 0.484885036945343,
"learning_rate": 8e-05,
"loss": 1.5687,
"step": 1003
},
{
"epoch": 0.20851505711318796,
"grad_norm": 0.4795399606227875,
"learning_rate": 8e-05,
"loss": 1.5629,
"step": 1004
},
{
"epoch": 0.2087227414330218,
"grad_norm": 0.5045424699783325,
"learning_rate": 8e-05,
"loss": 1.6474,
"step": 1005
},
{
"epoch": 0.20893042575285567,
"grad_norm": 0.49389609694480896,
"learning_rate": 8e-05,
"loss": 1.6446,
"step": 1006
},
{
"epoch": 0.2091381100726895,
"grad_norm": 0.5181095600128174,
"learning_rate": 8e-05,
"loss": 1.6578,
"step": 1007
},
{
"epoch": 0.20934579439252338,
"grad_norm": 0.4928913414478302,
"learning_rate": 8e-05,
"loss": 1.6385,
"step": 1008
},
{
"epoch": 0.20955347871235722,
"grad_norm": 0.5067183971405029,
"learning_rate": 8e-05,
"loss": 1.5462,
"step": 1009
},
{
"epoch": 0.20976116303219106,
"grad_norm": 0.46599841117858887,
"learning_rate": 8e-05,
"loss": 1.6113,
"step": 1010
},
{
"epoch": 0.20996884735202492,
"grad_norm": 0.5027761459350586,
"learning_rate": 8e-05,
"loss": 1.6009,
"step": 1011
},
{
"epoch": 0.21017653167185876,
"grad_norm": 0.5024246573448181,
"learning_rate": 8e-05,
"loss": 1.5819,
"step": 1012
},
{
"epoch": 0.21038421599169263,
"grad_norm": 0.4743015468120575,
"learning_rate": 8e-05,
"loss": 1.6115,
"step": 1013
},
{
"epoch": 0.21059190031152647,
"grad_norm": 0.5321120619773865,
"learning_rate": 8e-05,
"loss": 1.6409,
"step": 1014
},
{
"epoch": 0.21079958463136034,
"grad_norm": 0.4797392189502716,
"learning_rate": 8e-05,
"loss": 1.5711,
"step": 1015
},
{
"epoch": 0.21100726895119418,
"grad_norm": 0.5282681584358215,
"learning_rate": 8e-05,
"loss": 1.6369,
"step": 1016
},
{
"epoch": 0.21121495327102804,
"grad_norm": 0.4802643954753876,
"learning_rate": 8e-05,
"loss": 1.6363,
"step": 1017
},
{
"epoch": 0.21142263759086188,
"grad_norm": 0.479918897151947,
"learning_rate": 8e-05,
"loss": 1.6828,
"step": 1018
},
{
"epoch": 0.21163032191069575,
"grad_norm": 0.485012412071228,
"learning_rate": 8e-05,
"loss": 1.6451,
"step": 1019
},
{
"epoch": 0.2118380062305296,
"grad_norm": 0.47825416922569275,
"learning_rate": 8e-05,
"loss": 1.6492,
"step": 1020
},
{
"epoch": 0.21204569055036346,
"grad_norm": 0.48356735706329346,
"learning_rate": 8e-05,
"loss": 1.5729,
"step": 1021
},
{
"epoch": 0.2122533748701973,
"grad_norm": 0.4711926579475403,
"learning_rate": 8e-05,
"loss": 1.5919,
"step": 1022
},
{
"epoch": 0.21246105919003117,
"grad_norm": 0.4887542128562927,
"learning_rate": 8e-05,
"loss": 1.6684,
"step": 1023
},
{
"epoch": 0.212668743509865,
"grad_norm": 0.47914424538612366,
"learning_rate": 8e-05,
"loss": 1.5889,
"step": 1024
},
{
"epoch": 0.21287642782969884,
"grad_norm": 0.49408769607543945,
"learning_rate": 8e-05,
"loss": 1.663,
"step": 1025
},
{
"epoch": 0.2130841121495327,
"grad_norm": 0.49186617136001587,
"learning_rate": 8e-05,
"loss": 1.657,
"step": 1026
},
{
"epoch": 0.21329179646936655,
"grad_norm": 0.5070930123329163,
"learning_rate": 8e-05,
"loss": 1.6457,
"step": 1027
},
{
"epoch": 0.21349948078920042,
"grad_norm": 0.4801136553287506,
"learning_rate": 8e-05,
"loss": 1.6029,
"step": 1028
},
{
"epoch": 0.21370716510903426,
"grad_norm": 0.4819825291633606,
"learning_rate": 8e-05,
"loss": 1.6109,
"step": 1029
},
{
"epoch": 0.21391484942886813,
"grad_norm": 0.5037396550178528,
"learning_rate": 8e-05,
"loss": 1.6871,
"step": 1030
},
{
"epoch": 0.21412253374870197,
"grad_norm": 0.5130371451377869,
"learning_rate": 8e-05,
"loss": 1.7099,
"step": 1031
},
{
"epoch": 0.21433021806853583,
"grad_norm": 0.5002063512802124,
"learning_rate": 8e-05,
"loss": 1.6737,
"step": 1032
},
{
"epoch": 0.21453790238836967,
"grad_norm": 0.4774034917354584,
"learning_rate": 8e-05,
"loss": 1.6461,
"step": 1033
},
{
"epoch": 0.21474558670820354,
"grad_norm": 0.48661088943481445,
"learning_rate": 8e-05,
"loss": 1.5844,
"step": 1034
},
{
"epoch": 0.21495327102803738,
"grad_norm": 0.4912685453891754,
"learning_rate": 8e-05,
"loss": 1.6028,
"step": 1035
},
{
"epoch": 0.21516095534787125,
"grad_norm": 0.5158385634422302,
"learning_rate": 8e-05,
"loss": 1.6254,
"step": 1036
},
{
"epoch": 0.2153686396677051,
"grad_norm": 0.4866856038570404,
"learning_rate": 8e-05,
"loss": 1.654,
"step": 1037
},
{
"epoch": 0.21557632398753895,
"grad_norm": 0.5178074836730957,
"learning_rate": 8e-05,
"loss": 1.6212,
"step": 1038
},
{
"epoch": 0.2157840083073728,
"grad_norm": 0.48483380675315857,
"learning_rate": 8e-05,
"loss": 1.6116,
"step": 1039
},
{
"epoch": 0.21599169262720663,
"grad_norm": 0.47141504287719727,
"learning_rate": 8e-05,
"loss": 1.6042,
"step": 1040
},
{
"epoch": 0.2161993769470405,
"grad_norm": 0.5006888508796692,
"learning_rate": 8e-05,
"loss": 1.6242,
"step": 1041
},
{
"epoch": 0.21640706126687434,
"grad_norm": 0.47855380177497864,
"learning_rate": 8e-05,
"loss": 1.5661,
"step": 1042
},
{
"epoch": 0.2166147455867082,
"grad_norm": 0.5185720324516296,
"learning_rate": 8e-05,
"loss": 1.6747,
"step": 1043
},
{
"epoch": 0.21682242990654205,
"grad_norm": 0.4679183065891266,
"learning_rate": 8e-05,
"loss": 1.534,
"step": 1044
},
{
"epoch": 0.21703011422637591,
"grad_norm": 0.5030350685119629,
"learning_rate": 8e-05,
"loss": 1.624,
"step": 1045
},
{
"epoch": 0.21723779854620975,
"grad_norm": 0.48393726348876953,
"learning_rate": 8e-05,
"loss": 1.6531,
"step": 1046
},
{
"epoch": 0.21744548286604362,
"grad_norm": 0.4825136065483093,
"learning_rate": 8e-05,
"loss": 1.6242,
"step": 1047
},
{
"epoch": 0.21765316718587746,
"grad_norm": 0.4966621696949005,
"learning_rate": 8e-05,
"loss": 1.6368,
"step": 1048
},
{
"epoch": 0.21786085150571133,
"grad_norm": 0.48175907135009766,
"learning_rate": 8e-05,
"loss": 1.6121,
"step": 1049
},
{
"epoch": 0.21806853582554517,
"grad_norm": 0.4931303858757019,
"learning_rate": 8e-05,
"loss": 1.6452,
"step": 1050
},
{
"epoch": 0.21827622014537904,
"grad_norm": 0.4742048680782318,
"learning_rate": 8e-05,
"loss": 1.6037,
"step": 1051
},
{
"epoch": 0.21848390446521287,
"grad_norm": 0.4972934126853943,
"learning_rate": 8e-05,
"loss": 1.6522,
"step": 1052
},
{
"epoch": 0.21869158878504674,
"grad_norm": 0.49021288752555847,
"learning_rate": 8e-05,
"loss": 1.6386,
"step": 1053
},
{
"epoch": 0.21889927310488058,
"grad_norm": 0.4739423096179962,
"learning_rate": 8e-05,
"loss": 1.6267,
"step": 1054
},
{
"epoch": 0.21910695742471442,
"grad_norm": 0.49233463406562805,
"learning_rate": 8e-05,
"loss": 1.6757,
"step": 1055
},
{
"epoch": 0.2193146417445483,
"grad_norm": 0.4631211757659912,
"learning_rate": 8e-05,
"loss": 1.5198,
"step": 1056
},
{
"epoch": 0.21952232606438213,
"grad_norm": 0.5057953596115112,
"learning_rate": 8e-05,
"loss": 1.6831,
"step": 1057
},
{
"epoch": 0.219730010384216,
"grad_norm": 0.46444934606552124,
"learning_rate": 8e-05,
"loss": 1.6107,
"step": 1058
},
{
"epoch": 0.21993769470404984,
"grad_norm": 0.48006969690322876,
"learning_rate": 8e-05,
"loss": 1.6356,
"step": 1059
},
{
"epoch": 0.2201453790238837,
"grad_norm": 0.47697484493255615,
"learning_rate": 8e-05,
"loss": 1.605,
"step": 1060
},
{
"epoch": 0.22035306334371754,
"grad_norm": 0.45909324288368225,
"learning_rate": 8e-05,
"loss": 1.6047,
"step": 1061
},
{
"epoch": 0.2205607476635514,
"grad_norm": 0.45758333802223206,
"learning_rate": 8e-05,
"loss": 1.5597,
"step": 1062
},
{
"epoch": 0.22076843198338525,
"grad_norm": 0.49077147245407104,
"learning_rate": 8e-05,
"loss": 1.6428,
"step": 1063
},
{
"epoch": 0.22097611630321912,
"grad_norm": 0.49720385670661926,
"learning_rate": 8e-05,
"loss": 1.6476,
"step": 1064
},
{
"epoch": 0.22118380062305296,
"grad_norm": 0.5079086422920227,
"learning_rate": 8e-05,
"loss": 1.6564,
"step": 1065
},
{
"epoch": 0.22139148494288682,
"grad_norm": 0.47034189105033875,
"learning_rate": 8e-05,
"loss": 1.5323,
"step": 1066
},
{
"epoch": 0.22159916926272066,
"grad_norm": 0.5000166893005371,
"learning_rate": 8e-05,
"loss": 1.6268,
"step": 1067
},
{
"epoch": 0.22180685358255453,
"grad_norm": 0.47734546661376953,
"learning_rate": 8e-05,
"loss": 1.6071,
"step": 1068
},
{
"epoch": 0.22201453790238837,
"grad_norm": 0.49549058079719543,
"learning_rate": 8e-05,
"loss": 1.6152,
"step": 1069
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.4837387502193451,
"learning_rate": 8e-05,
"loss": 1.6049,
"step": 1070
},
{
"epoch": 0.22242990654205608,
"grad_norm": 0.46780574321746826,
"learning_rate": 8e-05,
"loss": 1.5809,
"step": 1071
},
{
"epoch": 0.22263759086188992,
"grad_norm": 0.4783675968647003,
"learning_rate": 8e-05,
"loss": 1.5655,
"step": 1072
},
{
"epoch": 0.22284527518172378,
"grad_norm": 0.4945738613605499,
"learning_rate": 8e-05,
"loss": 1.6605,
"step": 1073
},
{
"epoch": 0.22305295950155762,
"grad_norm": 0.5201888084411621,
"learning_rate": 8e-05,
"loss": 1.6003,
"step": 1074
},
{
"epoch": 0.2232606438213915,
"grad_norm": 0.5067545175552368,
"learning_rate": 8e-05,
"loss": 1.6575,
"step": 1075
},
{
"epoch": 0.22346832814122533,
"grad_norm": 0.4898606538772583,
"learning_rate": 8e-05,
"loss": 1.6554,
"step": 1076
},
{
"epoch": 0.2236760124610592,
"grad_norm": 0.5530943274497986,
"learning_rate": 8e-05,
"loss": 1.6715,
"step": 1077
},
{
"epoch": 0.22388369678089304,
"grad_norm": 0.4874219596385956,
"learning_rate": 8e-05,
"loss": 1.6182,
"step": 1078
},
{
"epoch": 0.2240913811007269,
"grad_norm": 0.5321915745735168,
"learning_rate": 8e-05,
"loss": 1.6416,
"step": 1079
},
{
"epoch": 0.22429906542056074,
"grad_norm": 0.46676793694496155,
"learning_rate": 8e-05,
"loss": 1.6459,
"step": 1080
},
{
"epoch": 0.2245067497403946,
"grad_norm": 0.4740728735923767,
"learning_rate": 8e-05,
"loss": 1.5671,
"step": 1081
},
{
"epoch": 0.22471443406022845,
"grad_norm": 0.48530635237693787,
"learning_rate": 8e-05,
"loss": 1.5294,
"step": 1082
},
{
"epoch": 0.22492211838006232,
"grad_norm": 0.4611906111240387,
"learning_rate": 8e-05,
"loss": 1.6403,
"step": 1083
},
{
"epoch": 0.22512980269989616,
"grad_norm": 0.4903087913990021,
"learning_rate": 8e-05,
"loss": 1.6264,
"step": 1084
},
{
"epoch": 0.22533748701973,
"grad_norm": 0.486613392829895,
"learning_rate": 8e-05,
"loss": 1.6315,
"step": 1085
},
{
"epoch": 0.22554517133956387,
"grad_norm": 0.5356956124305725,
"learning_rate": 8e-05,
"loss": 1.6754,
"step": 1086
},
{
"epoch": 0.2257528556593977,
"grad_norm": 0.5136194825172424,
"learning_rate": 8e-05,
"loss": 1.6052,
"step": 1087
},
{
"epoch": 0.22596053997923157,
"grad_norm": 0.49597111344337463,
"learning_rate": 8e-05,
"loss": 1.6403,
"step": 1088
},
{
"epoch": 0.2261682242990654,
"grad_norm": 0.48345381021499634,
"learning_rate": 8e-05,
"loss": 1.6072,
"step": 1089
},
{
"epoch": 0.22637590861889928,
"grad_norm": 0.505710780620575,
"learning_rate": 8e-05,
"loss": 1.5855,
"step": 1090
},
{
"epoch": 0.22658359293873312,
"grad_norm": 0.5041759014129639,
"learning_rate": 8e-05,
"loss": 1.6558,
"step": 1091
},
{
"epoch": 0.226791277258567,
"grad_norm": 0.5124122500419617,
"learning_rate": 8e-05,
"loss": 1.6171,
"step": 1092
},
{
"epoch": 0.22699896157840083,
"grad_norm": 0.5028002262115479,
"learning_rate": 8e-05,
"loss": 1.667,
"step": 1093
},
{
"epoch": 0.2272066458982347,
"grad_norm": 0.5096011161804199,
"learning_rate": 8e-05,
"loss": 1.6043,
"step": 1094
},
{
"epoch": 0.22741433021806853,
"grad_norm": 0.4749337136745453,
"learning_rate": 8e-05,
"loss": 1.6256,
"step": 1095
},
{
"epoch": 0.2276220145379024,
"grad_norm": 0.4887832701206207,
"learning_rate": 8e-05,
"loss": 1.6197,
"step": 1096
},
{
"epoch": 0.22782969885773624,
"grad_norm": 0.4867877662181854,
"learning_rate": 8e-05,
"loss": 1.6318,
"step": 1097
},
{
"epoch": 0.22803738317757008,
"grad_norm": 0.45605990290641785,
"learning_rate": 8e-05,
"loss": 1.5584,
"step": 1098
},
{
"epoch": 0.22824506749740395,
"grad_norm": 0.48189279437065125,
"learning_rate": 8e-05,
"loss": 1.618,
"step": 1099
},
{
"epoch": 0.2284527518172378,
"grad_norm": 0.510169506072998,
"learning_rate": 8e-05,
"loss": 1.6289,
"step": 1100
},
{
"epoch": 0.22866043613707165,
"grad_norm": 0.47950589656829834,
"learning_rate": 8e-05,
"loss": 1.6601,
"step": 1101
},
{
"epoch": 0.2288681204569055,
"grad_norm": 0.5073772072792053,
"learning_rate": 8e-05,
"loss": 1.6404,
"step": 1102
},
{
"epoch": 0.22907580477673936,
"grad_norm": 0.47775015234947205,
"learning_rate": 8e-05,
"loss": 1.6235,
"step": 1103
},
{
"epoch": 0.2292834890965732,
"grad_norm": 0.4975552558898926,
"learning_rate": 8e-05,
"loss": 1.6022,
"step": 1104
},
{
"epoch": 0.22949117341640707,
"grad_norm": 0.49373266100883484,
"learning_rate": 8e-05,
"loss": 1.631,
"step": 1105
},
{
"epoch": 0.2296988577362409,
"grad_norm": 0.5255943536758423,
"learning_rate": 8e-05,
"loss": 1.6067,
"step": 1106
},
{
"epoch": 0.22990654205607478,
"grad_norm": 0.4758288264274597,
"learning_rate": 8e-05,
"loss": 1.5626,
"step": 1107
},
{
"epoch": 0.23011422637590861,
"grad_norm": 0.4850333631038666,
"learning_rate": 8e-05,
"loss": 1.5805,
"step": 1108
},
{
"epoch": 0.23032191069574248,
"grad_norm": 0.4803512990474701,
"learning_rate": 8e-05,
"loss": 1.5439,
"step": 1109
},
{
"epoch": 0.23052959501557632,
"grad_norm": 0.4789465665817261,
"learning_rate": 8e-05,
"loss": 1.6022,
"step": 1110
},
{
"epoch": 0.2307372793354102,
"grad_norm": 0.5042217969894409,
"learning_rate": 8e-05,
"loss": 1.6692,
"step": 1111
},
{
"epoch": 0.23094496365524403,
"grad_norm": 0.497745543718338,
"learning_rate": 8e-05,
"loss": 1.6743,
"step": 1112
},
{
"epoch": 0.23115264797507787,
"grad_norm": 0.45986318588256836,
"learning_rate": 8e-05,
"loss": 1.5806,
"step": 1113
},
{
"epoch": 0.23136033229491174,
"grad_norm": 0.4838648736476898,
"learning_rate": 8e-05,
"loss": 1.653,
"step": 1114
},
{
"epoch": 0.23156801661474558,
"grad_norm": 0.4738873243331909,
"learning_rate": 8e-05,
"loss": 1.5853,
"step": 1115
},
{
"epoch": 0.23177570093457944,
"grad_norm": 0.48498794436454773,
"learning_rate": 8e-05,
"loss": 1.673,
"step": 1116
},
{
"epoch": 0.23198338525441328,
"grad_norm": 0.4791082739830017,
"learning_rate": 8e-05,
"loss": 1.5446,
"step": 1117
},
{
"epoch": 0.23219106957424715,
"grad_norm": 0.4974493086338043,
"learning_rate": 8e-05,
"loss": 1.5747,
"step": 1118
},
{
"epoch": 0.232398753894081,
"grad_norm": 0.5050761103630066,
"learning_rate": 8e-05,
"loss": 1.5965,
"step": 1119
},
{
"epoch": 0.23260643821391486,
"grad_norm": 0.47682154178619385,
"learning_rate": 8e-05,
"loss": 1.581,
"step": 1120
},
{
"epoch": 0.2328141225337487,
"grad_norm": 0.5180225372314453,
"learning_rate": 8e-05,
"loss": 1.6094,
"step": 1121
},
{
"epoch": 0.23302180685358256,
"grad_norm": 0.5043385028839111,
"learning_rate": 8e-05,
"loss": 1.6338,
"step": 1122
},
{
"epoch": 0.2332294911734164,
"grad_norm": 0.5274357199668884,
"learning_rate": 8e-05,
"loss": 1.6203,
"step": 1123
},
{
"epoch": 0.23343717549325027,
"grad_norm": 0.49157971143722534,
"learning_rate": 8e-05,
"loss": 1.5854,
"step": 1124
},
{
"epoch": 0.2336448598130841,
"grad_norm": 0.5090160369873047,
"learning_rate": 8e-05,
"loss": 1.6155,
"step": 1125
},
{
"epoch": 0.23385254413291798,
"grad_norm": 0.4960342049598694,
"learning_rate": 8e-05,
"loss": 1.6501,
"step": 1126
},
{
"epoch": 0.23406022845275182,
"grad_norm": 0.49579086899757385,
"learning_rate": 8e-05,
"loss": 1.6299,
"step": 1127
},
{
"epoch": 0.23426791277258566,
"grad_norm": 0.5103940963745117,
"learning_rate": 8e-05,
"loss": 1.5807,
"step": 1128
},
{
"epoch": 0.23447559709241952,
"grad_norm": 0.47810351848602295,
"learning_rate": 8e-05,
"loss": 1.6593,
"step": 1129
},
{
"epoch": 0.23468328141225336,
"grad_norm": 0.48690786957740784,
"learning_rate": 8e-05,
"loss": 1.5911,
"step": 1130
},
{
"epoch": 0.23489096573208723,
"grad_norm": 0.5075724720954895,
"learning_rate": 8e-05,
"loss": 1.6673,
"step": 1131
},
{
"epoch": 0.23509865005192107,
"grad_norm": 0.5023790001869202,
"learning_rate": 8e-05,
"loss": 1.6244,
"step": 1132
},
{
"epoch": 0.23530633437175494,
"grad_norm": 0.49479010701179504,
"learning_rate": 8e-05,
"loss": 1.5765,
"step": 1133
},
{
"epoch": 0.23551401869158878,
"grad_norm": 0.47907403111457825,
"learning_rate": 8e-05,
"loss": 1.6108,
"step": 1134
},
{
"epoch": 0.23572170301142265,
"grad_norm": 0.5096808075904846,
"learning_rate": 8e-05,
"loss": 1.5931,
"step": 1135
},
{
"epoch": 0.23592938733125648,
"grad_norm": 0.4935376048088074,
"learning_rate": 8e-05,
"loss": 1.6679,
"step": 1136
},
{
"epoch": 0.23613707165109035,
"grad_norm": 0.4955846667289734,
"learning_rate": 8e-05,
"loss": 1.6786,
"step": 1137
},
{
"epoch": 0.2363447559709242,
"grad_norm": 0.5047092437744141,
"learning_rate": 8e-05,
"loss": 1.5719,
"step": 1138
},
{
"epoch": 0.23655244029075806,
"grad_norm": 0.4767501950263977,
"learning_rate": 8e-05,
"loss": 1.5391,
"step": 1139
},
{
"epoch": 0.2367601246105919,
"grad_norm": 0.49148985743522644,
"learning_rate": 8e-05,
"loss": 1.6281,
"step": 1140
},
{
"epoch": 0.23696780893042577,
"grad_norm": 0.4789850413799286,
"learning_rate": 8e-05,
"loss": 1.541,
"step": 1141
},
{
"epoch": 0.2371754932502596,
"grad_norm": 0.47179505228996277,
"learning_rate": 8e-05,
"loss": 1.56,
"step": 1142
},
{
"epoch": 0.23738317757009345,
"grad_norm": 0.4852232038974762,
"learning_rate": 8e-05,
"loss": 1.6166,
"step": 1143
},
{
"epoch": 0.2375908618899273,
"grad_norm": 0.4789161682128906,
"learning_rate": 8e-05,
"loss": 1.6051,
"step": 1144
},
{
"epoch": 0.23779854620976115,
"grad_norm": 0.5053293108940125,
"learning_rate": 8e-05,
"loss": 1.6531,
"step": 1145
},
{
"epoch": 0.23800623052959502,
"grad_norm": 0.48226839303970337,
"learning_rate": 8e-05,
"loss": 1.6138,
"step": 1146
},
{
"epoch": 0.23821391484942886,
"grad_norm": 0.4766741693019867,
"learning_rate": 8e-05,
"loss": 1.5757,
"step": 1147
},
{
"epoch": 0.23842159916926273,
"grad_norm": 0.4817132353782654,
"learning_rate": 8e-05,
"loss": 1.5919,
"step": 1148
},
{
"epoch": 0.23862928348909657,
"grad_norm": 0.48039570450782776,
"learning_rate": 8e-05,
"loss": 1.5237,
"step": 1149
},
{
"epoch": 0.23883696780893043,
"grad_norm": 0.539954662322998,
"learning_rate": 8e-05,
"loss": 1.7231,
"step": 1150
},
{
"epoch": 0.23904465212876427,
"grad_norm": 0.49873772263526917,
"learning_rate": 8e-05,
"loss": 1.6099,
"step": 1151
},
{
"epoch": 0.23925233644859814,
"grad_norm": 0.5055965185165405,
"learning_rate": 8e-05,
"loss": 1.6688,
"step": 1152
},
{
"epoch": 0.23946002076843198,
"grad_norm": 0.4793665409088135,
"learning_rate": 8e-05,
"loss": 1.6186,
"step": 1153
},
{
"epoch": 0.23966770508826585,
"grad_norm": 0.49260565638542175,
"learning_rate": 8e-05,
"loss": 1.5712,
"step": 1154
},
{
"epoch": 0.2398753894080997,
"grad_norm": 0.48806118965148926,
"learning_rate": 8e-05,
"loss": 1.6346,
"step": 1155
},
{
"epoch": 0.24008307372793355,
"grad_norm": 0.4877453148365021,
"learning_rate": 8e-05,
"loss": 1.5825,
"step": 1156
},
{
"epoch": 0.2402907580477674,
"grad_norm": 0.4928903877735138,
"learning_rate": 8e-05,
"loss": 1.6033,
"step": 1157
},
{
"epoch": 0.24049844236760123,
"grad_norm": 0.4725443124771118,
"learning_rate": 8e-05,
"loss": 1.6135,
"step": 1158
},
{
"epoch": 0.2407061266874351,
"grad_norm": 0.47031083703041077,
"learning_rate": 8e-05,
"loss": 1.6,
"step": 1159
},
{
"epoch": 0.24091381100726894,
"grad_norm": 0.46935370564460754,
"learning_rate": 8e-05,
"loss": 1.574,
"step": 1160
},
{
"epoch": 0.2411214953271028,
"grad_norm": 0.4930771291255951,
"learning_rate": 8e-05,
"loss": 1.5769,
"step": 1161
},
{
"epoch": 0.24132917964693665,
"grad_norm": 0.4878680408000946,
"learning_rate": 8e-05,
"loss": 1.6057,
"step": 1162
},
{
"epoch": 0.24153686396677052,
"grad_norm": 0.46639183163642883,
"learning_rate": 8e-05,
"loss": 1.5862,
"step": 1163
},
{
"epoch": 0.24174454828660435,
"grad_norm": 0.48250576853752136,
"learning_rate": 8e-05,
"loss": 1.5948,
"step": 1164
},
{
"epoch": 0.24195223260643822,
"grad_norm": 0.4780838191509247,
"learning_rate": 8e-05,
"loss": 1.5429,
"step": 1165
},
{
"epoch": 0.24215991692627206,
"grad_norm": 0.4783189296722412,
"learning_rate": 8e-05,
"loss": 1.6205,
"step": 1166
},
{
"epoch": 0.24236760124610593,
"grad_norm": 0.4721772372722626,
"learning_rate": 8e-05,
"loss": 1.5667,
"step": 1167
},
{
"epoch": 0.24257528556593977,
"grad_norm": 0.49488070607185364,
"learning_rate": 8e-05,
"loss": 1.6791,
"step": 1168
},
{
"epoch": 0.24278296988577364,
"grad_norm": 0.49183887243270874,
"learning_rate": 8e-05,
"loss": 1.6051,
"step": 1169
},
{
"epoch": 0.24299065420560748,
"grad_norm": 0.4702599346637726,
"learning_rate": 8e-05,
"loss": 1.5553,
"step": 1170
},
{
"epoch": 0.24319833852544132,
"grad_norm": 0.48021695017814636,
"learning_rate": 8e-05,
"loss": 1.5804,
"step": 1171
},
{
"epoch": 0.24340602284527518,
"grad_norm": 0.5404965281486511,
"learning_rate": 8e-05,
"loss": 1.6724,
"step": 1172
},
{
"epoch": 0.24361370716510902,
"grad_norm": 0.5167496204376221,
"learning_rate": 8e-05,
"loss": 1.5631,
"step": 1173
},
{
"epoch": 0.2438213914849429,
"grad_norm": 0.4974764883518219,
"learning_rate": 8e-05,
"loss": 1.5615,
"step": 1174
},
{
"epoch": 0.24402907580477673,
"grad_norm": 0.4905969202518463,
"learning_rate": 8e-05,
"loss": 1.6023,
"step": 1175
},
{
"epoch": 0.2442367601246106,
"grad_norm": 0.5358709692955017,
"learning_rate": 8e-05,
"loss": 1.5555,
"step": 1176
},
{
"epoch": 0.24444444444444444,
"grad_norm": 0.49605727195739746,
"learning_rate": 8e-05,
"loss": 1.6356,
"step": 1177
},
{
"epoch": 0.2446521287642783,
"grad_norm": 0.4891799986362457,
"learning_rate": 8e-05,
"loss": 1.6877,
"step": 1178
},
{
"epoch": 0.24485981308411214,
"grad_norm": 0.4895115792751312,
"learning_rate": 8e-05,
"loss": 1.5594,
"step": 1179
},
{
"epoch": 0.245067497403946,
"grad_norm": 0.49354806542396545,
"learning_rate": 8e-05,
"loss": 1.6815,
"step": 1180
},
{
"epoch": 0.24527518172377985,
"grad_norm": 0.4897477924823761,
"learning_rate": 8e-05,
"loss": 1.6378,
"step": 1181
},
{
"epoch": 0.24548286604361372,
"grad_norm": 0.4873749613761902,
"learning_rate": 8e-05,
"loss": 1.5757,
"step": 1182
},
{
"epoch": 0.24569055036344756,
"grad_norm": 0.464875191450119,
"learning_rate": 8e-05,
"loss": 1.5587,
"step": 1183
},
{
"epoch": 0.24589823468328142,
"grad_norm": 0.4927029311656952,
"learning_rate": 8e-05,
"loss": 1.6541,
"step": 1184
},
{
"epoch": 0.24610591900311526,
"grad_norm": 0.4638020694255829,
"learning_rate": 8e-05,
"loss": 1.5619,
"step": 1185
},
{
"epoch": 0.2463136033229491,
"grad_norm": 0.4894510805606842,
"learning_rate": 8e-05,
"loss": 1.5729,
"step": 1186
},
{
"epoch": 0.24652128764278297,
"grad_norm": 0.4695272743701935,
"learning_rate": 8e-05,
"loss": 1.6059,
"step": 1187
},
{
"epoch": 0.2467289719626168,
"grad_norm": 0.47015345096588135,
"learning_rate": 8e-05,
"loss": 1.6093,
"step": 1188
},
{
"epoch": 0.24693665628245068,
"grad_norm": 0.4933643043041229,
"learning_rate": 8e-05,
"loss": 1.5702,
"step": 1189
},
{
"epoch": 0.24714434060228452,
"grad_norm": 0.4731082022190094,
"learning_rate": 8e-05,
"loss": 1.5177,
"step": 1190
},
{
"epoch": 0.24735202492211839,
"grad_norm": 0.48732051253318787,
"learning_rate": 8e-05,
"loss": 1.6784,
"step": 1191
},
{
"epoch": 0.24755970924195222,
"grad_norm": 0.5021098256111145,
"learning_rate": 8e-05,
"loss": 1.6564,
"step": 1192
},
{
"epoch": 0.2477673935617861,
"grad_norm": 0.48108044266700745,
"learning_rate": 8e-05,
"loss": 1.6051,
"step": 1193
},
{
"epoch": 0.24797507788161993,
"grad_norm": 0.47929883003234863,
"learning_rate": 8e-05,
"loss": 1.6104,
"step": 1194
},
{
"epoch": 0.2481827622014538,
"grad_norm": 0.502677321434021,
"learning_rate": 8e-05,
"loss": 1.6059,
"step": 1195
},
{
"epoch": 0.24839044652128764,
"grad_norm": 0.49433907866477966,
"learning_rate": 8e-05,
"loss": 1.6257,
"step": 1196
},
{
"epoch": 0.2485981308411215,
"grad_norm": 0.4825282394886017,
"learning_rate": 8e-05,
"loss": 1.6134,
"step": 1197
},
{
"epoch": 0.24880581516095535,
"grad_norm": 0.4730031192302704,
"learning_rate": 8e-05,
"loss": 1.581,
"step": 1198
},
{
"epoch": 0.2490134994807892,
"grad_norm": 0.4965001940727234,
"learning_rate": 8e-05,
"loss": 1.6166,
"step": 1199
},
{
"epoch": 0.24922118380062305,
"grad_norm": 0.49881839752197266,
"learning_rate": 8e-05,
"loss": 1.6278,
"step": 1200
},
{
"epoch": 0.2494288681204569,
"grad_norm": 0.484718382358551,
"learning_rate": 8e-05,
"loss": 1.6499,
"step": 1201
},
{
"epoch": 0.24963655244029076,
"grad_norm": 0.4792778491973877,
"learning_rate": 8e-05,
"loss": 1.6444,
"step": 1202
},
{
"epoch": 0.2498442367601246,
"grad_norm": 0.4959498643875122,
"learning_rate": 8e-05,
"loss": 1.6745,
"step": 1203
},
{
"epoch": 0.25005192107995844,
"grad_norm": 0.48365911841392517,
"learning_rate": 8e-05,
"loss": 1.5993,
"step": 1204
},
{
"epoch": 0.2502596053997923,
"grad_norm": 0.5093377828598022,
"learning_rate": 8e-05,
"loss": 1.628,
"step": 1205
},
{
"epoch": 0.2504672897196262,
"grad_norm": 0.486716091632843,
"learning_rate": 8e-05,
"loss": 1.6436,
"step": 1206
},
{
"epoch": 0.25067497403946004,
"grad_norm": 0.4940541982650757,
"learning_rate": 8e-05,
"loss": 1.5606,
"step": 1207
},
{
"epoch": 0.25088265835929385,
"grad_norm": 0.4985447824001312,
"learning_rate": 8e-05,
"loss": 1.7143,
"step": 1208
},
{
"epoch": 0.2510903426791277,
"grad_norm": 0.49395254254341125,
"learning_rate": 8e-05,
"loss": 1.5876,
"step": 1209
},
{
"epoch": 0.2512980269989616,
"grad_norm": 0.4942306876182556,
"learning_rate": 8e-05,
"loss": 1.6473,
"step": 1210
},
{
"epoch": 0.25150571131879546,
"grad_norm": 0.48080599308013916,
"learning_rate": 8e-05,
"loss": 1.6188,
"step": 1211
},
{
"epoch": 0.25171339563862927,
"grad_norm": 0.4913437068462372,
"learning_rate": 8e-05,
"loss": 1.6626,
"step": 1212
},
{
"epoch": 0.25192107995846313,
"grad_norm": 0.4762611985206604,
"learning_rate": 8e-05,
"loss": 1.5715,
"step": 1213
},
{
"epoch": 0.252128764278297,
"grad_norm": 0.49467694759368896,
"learning_rate": 8e-05,
"loss": 1.6428,
"step": 1214
},
{
"epoch": 0.2523364485981308,
"grad_norm": 0.49399566650390625,
"learning_rate": 8e-05,
"loss": 1.5658,
"step": 1215
},
{
"epoch": 0.2525441329179647,
"grad_norm": 0.47850465774536133,
"learning_rate": 8e-05,
"loss": 1.6057,
"step": 1216
},
{
"epoch": 0.25275181723779855,
"grad_norm": 0.5025778412818909,
"learning_rate": 8e-05,
"loss": 1.6746,
"step": 1217
},
{
"epoch": 0.2529595015576324,
"grad_norm": 0.4746226966381073,
"learning_rate": 8e-05,
"loss": 1.5973,
"step": 1218
},
{
"epoch": 0.2531671858774662,
"grad_norm": 0.490772008895874,
"learning_rate": 8e-05,
"loss": 1.6176,
"step": 1219
},
{
"epoch": 0.2533748701973001,
"grad_norm": 0.477453351020813,
"learning_rate": 8e-05,
"loss": 1.6027,
"step": 1220
},
{
"epoch": 0.25358255451713396,
"grad_norm": 0.48569169640541077,
"learning_rate": 8e-05,
"loss": 1.6144,
"step": 1221
},
{
"epoch": 0.25379023883696783,
"grad_norm": 0.48016172647476196,
"learning_rate": 8e-05,
"loss": 1.612,
"step": 1222
},
{
"epoch": 0.25399792315680164,
"grad_norm": 0.458385169506073,
"learning_rate": 8e-05,
"loss": 1.4938,
"step": 1223
},
{
"epoch": 0.2542056074766355,
"grad_norm": 0.4843856692314148,
"learning_rate": 8e-05,
"loss": 1.5878,
"step": 1224
},
{
"epoch": 0.2544132917964694,
"grad_norm": 0.5237738490104675,
"learning_rate": 8e-05,
"loss": 1.6753,
"step": 1225
},
{
"epoch": 0.25462097611630324,
"grad_norm": 0.5184034109115601,
"learning_rate": 8e-05,
"loss": 1.5755,
"step": 1226
},
{
"epoch": 0.25482866043613706,
"grad_norm": 0.5014573931694031,
"learning_rate": 8e-05,
"loss": 1.6439,
"step": 1227
},
{
"epoch": 0.2550363447559709,
"grad_norm": 0.4746113419532776,
"learning_rate": 8e-05,
"loss": 1.6612,
"step": 1228
},
{
"epoch": 0.2552440290758048,
"grad_norm": 0.48897260427474976,
"learning_rate": 8e-05,
"loss": 1.5836,
"step": 1229
},
{
"epoch": 0.2554517133956386,
"grad_norm": 0.47817346453666687,
"learning_rate": 8e-05,
"loss": 1.5826,
"step": 1230
},
{
"epoch": 0.25565939771547247,
"grad_norm": 0.5092668533325195,
"learning_rate": 8e-05,
"loss": 1.6462,
"step": 1231
},
{
"epoch": 0.25586708203530634,
"grad_norm": 0.4775659739971161,
"learning_rate": 8e-05,
"loss": 1.5857,
"step": 1232
},
{
"epoch": 0.2560747663551402,
"grad_norm": 0.5099481344223022,
"learning_rate": 8e-05,
"loss": 1.592,
"step": 1233
},
{
"epoch": 0.256282450674974,
"grad_norm": 0.4921092987060547,
"learning_rate": 8e-05,
"loss": 1.5091,
"step": 1234
},
{
"epoch": 0.2564901349948079,
"grad_norm": 0.5136047601699829,
"learning_rate": 8e-05,
"loss": 1.6556,
"step": 1235
},
{
"epoch": 0.25669781931464175,
"grad_norm": 0.4959682822227478,
"learning_rate": 8e-05,
"loss": 1.6347,
"step": 1236
},
{
"epoch": 0.2569055036344756,
"grad_norm": 0.5049384236335754,
"learning_rate": 8e-05,
"loss": 1.6096,
"step": 1237
},
{
"epoch": 0.25711318795430943,
"grad_norm": 0.5177755951881409,
"learning_rate": 8e-05,
"loss": 1.5408,
"step": 1238
},
{
"epoch": 0.2573208722741433,
"grad_norm": 0.4715554416179657,
"learning_rate": 8e-05,
"loss": 1.6184,
"step": 1239
},
{
"epoch": 0.25752855659397716,
"grad_norm": 0.5331630110740662,
"learning_rate": 8e-05,
"loss": 1.6457,
"step": 1240
},
{
"epoch": 0.25773624091381103,
"grad_norm": 0.48646849393844604,
"learning_rate": 8e-05,
"loss": 1.6077,
"step": 1241
},
{
"epoch": 0.25794392523364484,
"grad_norm": 0.48699328303337097,
"learning_rate": 8e-05,
"loss": 1.5695,
"step": 1242
},
{
"epoch": 0.2581516095534787,
"grad_norm": 0.4788655936717987,
"learning_rate": 8e-05,
"loss": 1.6047,
"step": 1243
},
{
"epoch": 0.2583592938733126,
"grad_norm": 0.49768000841140747,
"learning_rate": 8e-05,
"loss": 1.6321,
"step": 1244
},
{
"epoch": 0.2585669781931464,
"grad_norm": 0.4728749096393585,
"learning_rate": 8e-05,
"loss": 1.4864,
"step": 1245
},
{
"epoch": 0.25877466251298026,
"grad_norm": 0.4823184311389923,
"learning_rate": 8e-05,
"loss": 1.5463,
"step": 1246
},
{
"epoch": 0.2589823468328141,
"grad_norm": 0.49181875586509705,
"learning_rate": 8e-05,
"loss": 1.6372,
"step": 1247
},
{
"epoch": 0.259190031152648,
"grad_norm": 0.48405319452285767,
"learning_rate": 8e-05,
"loss": 1.598,
"step": 1248
},
{
"epoch": 0.2593977154724818,
"grad_norm": 0.5048245787620544,
"learning_rate": 8e-05,
"loss": 1.5521,
"step": 1249
},
{
"epoch": 0.25960539979231567,
"grad_norm": 0.4842333495616913,
"learning_rate": 8e-05,
"loss": 1.6642,
"step": 1250
},
{
"epoch": 0.25981308411214954,
"grad_norm": 0.48424315452575684,
"learning_rate": 8e-05,
"loss": 1.6036,
"step": 1251
},
{
"epoch": 0.2600207684319834,
"grad_norm": 0.46493804454803467,
"learning_rate": 8e-05,
"loss": 1.6679,
"step": 1252
},
{
"epoch": 0.2602284527518172,
"grad_norm": 0.5107417106628418,
"learning_rate": 8e-05,
"loss": 1.6116,
"step": 1253
},
{
"epoch": 0.2604361370716511,
"grad_norm": 0.48230957984924316,
"learning_rate": 8e-05,
"loss": 1.5848,
"step": 1254
},
{
"epoch": 0.26064382139148495,
"grad_norm": 0.4716993570327759,
"learning_rate": 8e-05,
"loss": 1.5599,
"step": 1255
},
{
"epoch": 0.2608515057113188,
"grad_norm": 0.4995378255844116,
"learning_rate": 8e-05,
"loss": 1.5162,
"step": 1256
},
{
"epoch": 0.26105919003115263,
"grad_norm": 0.49914610385894775,
"learning_rate": 8e-05,
"loss": 1.5889,
"step": 1257
},
{
"epoch": 0.2612668743509865,
"grad_norm": 0.5149269104003906,
"learning_rate": 8e-05,
"loss": 1.5693,
"step": 1258
},
{
"epoch": 0.26147455867082037,
"grad_norm": 0.4664236903190613,
"learning_rate": 8e-05,
"loss": 1.5911,
"step": 1259
},
{
"epoch": 0.2616822429906542,
"grad_norm": 0.4691005349159241,
"learning_rate": 8e-05,
"loss": 1.5378,
"step": 1260
},
{
"epoch": 0.26188992731048805,
"grad_norm": 0.5125289559364319,
"learning_rate": 8e-05,
"loss": 1.6207,
"step": 1261
},
{
"epoch": 0.2620976116303219,
"grad_norm": 0.5016699433326721,
"learning_rate": 8e-05,
"loss": 1.7096,
"step": 1262
},
{
"epoch": 0.2623052959501558,
"grad_norm": 0.4931148290634155,
"learning_rate": 8e-05,
"loss": 1.535,
"step": 1263
},
{
"epoch": 0.2625129802699896,
"grad_norm": 0.4818016588687897,
"learning_rate": 8e-05,
"loss": 1.6163,
"step": 1264
},
{
"epoch": 0.26272066458982346,
"grad_norm": 0.4864917993545532,
"learning_rate": 8e-05,
"loss": 1.5272,
"step": 1265
},
{
"epoch": 0.26292834890965733,
"grad_norm": 0.4974689483642578,
"learning_rate": 8e-05,
"loss": 1.6396,
"step": 1266
},
{
"epoch": 0.2631360332294912,
"grad_norm": 0.48342207074165344,
"learning_rate": 8e-05,
"loss": 1.5803,
"step": 1267
},
{
"epoch": 0.263343717549325,
"grad_norm": 0.5663976073265076,
"learning_rate": 8e-05,
"loss": 1.6336,
"step": 1268
},
{
"epoch": 0.2635514018691589,
"grad_norm": 0.5170464515686035,
"learning_rate": 8e-05,
"loss": 1.6435,
"step": 1269
},
{
"epoch": 0.26375908618899274,
"grad_norm": 0.49931710958480835,
"learning_rate": 8e-05,
"loss": 1.697,
"step": 1270
},
{
"epoch": 0.2639667705088266,
"grad_norm": 0.4786977767944336,
"learning_rate": 8e-05,
"loss": 1.5923,
"step": 1271
},
{
"epoch": 0.2641744548286604,
"grad_norm": 0.490163117647171,
"learning_rate": 8e-05,
"loss": 1.5944,
"step": 1272
},
{
"epoch": 0.2643821391484943,
"grad_norm": 0.469289630651474,
"learning_rate": 8e-05,
"loss": 1.4951,
"step": 1273
},
{
"epoch": 0.26458982346832816,
"grad_norm": 0.49278539419174194,
"learning_rate": 8e-05,
"loss": 1.5941,
"step": 1274
},
{
"epoch": 0.26479750778816197,
"grad_norm": 0.4842006266117096,
"learning_rate": 8e-05,
"loss": 1.5923,
"step": 1275
},
{
"epoch": 0.26500519210799584,
"grad_norm": 0.5030384659767151,
"learning_rate": 8e-05,
"loss": 1.6361,
"step": 1276
},
{
"epoch": 0.2652128764278297,
"grad_norm": 0.4949890971183777,
"learning_rate": 8e-05,
"loss": 1.6389,
"step": 1277
},
{
"epoch": 0.26542056074766357,
"grad_norm": 0.4904261827468872,
"learning_rate": 8e-05,
"loss": 1.5555,
"step": 1278
},
{
"epoch": 0.2656282450674974,
"grad_norm": 0.48278945684432983,
"learning_rate": 8e-05,
"loss": 1.64,
"step": 1279
},
{
"epoch": 0.26583592938733125,
"grad_norm": 0.48540300130844116,
"learning_rate": 8e-05,
"loss": 1.6054,
"step": 1280
},
{
"epoch": 0.2660436137071651,
"grad_norm": 0.5015822649002075,
"learning_rate": 8e-05,
"loss": 1.5868,
"step": 1281
},
{
"epoch": 0.266251298026999,
"grad_norm": 0.49217545986175537,
"learning_rate": 8e-05,
"loss": 1.6145,
"step": 1282
},
{
"epoch": 0.2664589823468328,
"grad_norm": 0.5260308384895325,
"learning_rate": 8e-05,
"loss": 1.6333,
"step": 1283
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.5329016447067261,
"learning_rate": 8e-05,
"loss": 1.7274,
"step": 1284
},
{
"epoch": 0.26687435098650053,
"grad_norm": 0.5138272047042847,
"learning_rate": 8e-05,
"loss": 1.6601,
"step": 1285
},
{
"epoch": 0.2670820353063344,
"grad_norm": 0.4883299171924591,
"learning_rate": 8e-05,
"loss": 1.5988,
"step": 1286
},
{
"epoch": 0.2672897196261682,
"grad_norm": 0.5150203108787537,
"learning_rate": 8e-05,
"loss": 1.6732,
"step": 1287
},
{
"epoch": 0.2674974039460021,
"grad_norm": 0.49123409390449524,
"learning_rate": 8e-05,
"loss": 1.5287,
"step": 1288
},
{
"epoch": 0.26770508826583594,
"grad_norm": 0.4945281744003296,
"learning_rate": 8e-05,
"loss": 1.6321,
"step": 1289
},
{
"epoch": 0.26791277258566976,
"grad_norm": 0.5029784440994263,
"learning_rate": 8e-05,
"loss": 1.6214,
"step": 1290
},
{
"epoch": 0.2681204569055036,
"grad_norm": 0.46249303221702576,
"learning_rate": 8e-05,
"loss": 1.4891,
"step": 1291
},
{
"epoch": 0.2683281412253375,
"grad_norm": 0.5195319652557373,
"learning_rate": 8e-05,
"loss": 1.6059,
"step": 1292
},
{
"epoch": 0.26853582554517136,
"grad_norm": 0.5203794836997986,
"learning_rate": 8e-05,
"loss": 1.6673,
"step": 1293
},
{
"epoch": 0.26874350986500517,
"grad_norm": 0.4765598773956299,
"learning_rate": 8e-05,
"loss": 1.6292,
"step": 1294
},
{
"epoch": 0.26895119418483904,
"grad_norm": 0.4790004789829254,
"learning_rate": 8e-05,
"loss": 1.5275,
"step": 1295
},
{
"epoch": 0.2691588785046729,
"grad_norm": 0.486456036567688,
"learning_rate": 8e-05,
"loss": 1.5916,
"step": 1296
},
{
"epoch": 0.2693665628245068,
"grad_norm": 0.4780852496623993,
"learning_rate": 8e-05,
"loss": 1.6143,
"step": 1297
},
{
"epoch": 0.2695742471443406,
"grad_norm": 0.5063662528991699,
"learning_rate": 8e-05,
"loss": 1.598,
"step": 1298
},
{
"epoch": 0.26978193146417445,
"grad_norm": 0.4856896996498108,
"learning_rate": 8e-05,
"loss": 1.6217,
"step": 1299
},
{
"epoch": 0.2699896157840083,
"grad_norm": 0.4855417013168335,
"learning_rate": 8e-05,
"loss": 1.5746,
"step": 1300
},
{
"epoch": 0.2701973001038422,
"grad_norm": 0.4749685227870941,
"learning_rate": 8e-05,
"loss": 1.529,
"step": 1301
},
{
"epoch": 0.270404984423676,
"grad_norm": 0.4671476185321808,
"learning_rate": 8e-05,
"loss": 1.5717,
"step": 1302
},
{
"epoch": 0.27061266874350987,
"grad_norm": 0.49062880873680115,
"learning_rate": 8e-05,
"loss": 1.5799,
"step": 1303
},
{
"epoch": 0.27082035306334373,
"grad_norm": 0.4858238995075226,
"learning_rate": 8e-05,
"loss": 1.6249,
"step": 1304
},
{
"epoch": 0.27102803738317754,
"grad_norm": 0.4788179397583008,
"learning_rate": 8e-05,
"loss": 1.5742,
"step": 1305
},
{
"epoch": 0.2712357217030114,
"grad_norm": 0.5083610415458679,
"learning_rate": 8e-05,
"loss": 1.5634,
"step": 1306
},
{
"epoch": 0.2714434060228453,
"grad_norm": 0.4782449007034302,
"learning_rate": 8e-05,
"loss": 1.593,
"step": 1307
},
{
"epoch": 0.27165109034267915,
"grad_norm": 0.5111173391342163,
"learning_rate": 8e-05,
"loss": 1.5827,
"step": 1308
},
{
"epoch": 0.27185877466251296,
"grad_norm": 0.4896087646484375,
"learning_rate": 8e-05,
"loss": 1.5713,
"step": 1309
},
{
"epoch": 0.2720664589823468,
"grad_norm": 0.50101238489151,
"learning_rate": 8e-05,
"loss": 1.5708,
"step": 1310
},
{
"epoch": 0.2722741433021807,
"grad_norm": 0.4787428677082062,
"learning_rate": 8e-05,
"loss": 1.53,
"step": 1311
},
{
"epoch": 0.27248182762201456,
"grad_norm": 0.4917222857475281,
"learning_rate": 8e-05,
"loss": 1.6704,
"step": 1312
},
{
"epoch": 0.2726895119418484,
"grad_norm": 0.4935707449913025,
"learning_rate": 8e-05,
"loss": 1.5838,
"step": 1313
},
{
"epoch": 0.27289719626168224,
"grad_norm": 0.48440101742744446,
"learning_rate": 8e-05,
"loss": 1.6517,
"step": 1314
},
{
"epoch": 0.2731048805815161,
"grad_norm": 0.47382527589797974,
"learning_rate": 8e-05,
"loss": 1.5884,
"step": 1315
},
{
"epoch": 0.27331256490135,
"grad_norm": 0.5190213322639465,
"learning_rate": 8e-05,
"loss": 1.6797,
"step": 1316
},
{
"epoch": 0.2735202492211838,
"grad_norm": 0.49640318751335144,
"learning_rate": 8e-05,
"loss": 1.6594,
"step": 1317
},
{
"epoch": 0.27372793354101765,
"grad_norm": 0.5151790380477905,
"learning_rate": 8e-05,
"loss": 1.5557,
"step": 1318
},
{
"epoch": 0.2739356178608515,
"grad_norm": 0.47358784079551697,
"learning_rate": 8e-05,
"loss": 1.5639,
"step": 1319
},
{
"epoch": 0.27414330218068533,
"grad_norm": 0.5438617467880249,
"learning_rate": 8e-05,
"loss": 1.6362,
"step": 1320
},
{
"epoch": 0.2743509865005192,
"grad_norm": 0.4804224967956543,
"learning_rate": 8e-05,
"loss": 1.5946,
"step": 1321
},
{
"epoch": 0.27455867082035307,
"grad_norm": 0.5258082151412964,
"learning_rate": 8e-05,
"loss": 1.5561,
"step": 1322
},
{
"epoch": 0.27476635514018694,
"grad_norm": 0.48843201994895935,
"learning_rate": 8e-05,
"loss": 1.5366,
"step": 1323
},
{
"epoch": 0.27497403946002075,
"grad_norm": 0.47955572605133057,
"learning_rate": 8e-05,
"loss": 1.6056,
"step": 1324
},
{
"epoch": 0.2751817237798546,
"grad_norm": 0.4964466392993927,
"learning_rate": 8e-05,
"loss": 1.6292,
"step": 1325
},
{
"epoch": 0.2753894080996885,
"grad_norm": 0.4776870608329773,
"learning_rate": 8e-05,
"loss": 1.6147,
"step": 1326
},
{
"epoch": 0.27559709241952235,
"grad_norm": 0.49412986636161804,
"learning_rate": 8e-05,
"loss": 1.596,
"step": 1327
},
{
"epoch": 0.27580477673935616,
"grad_norm": 0.49872222542762756,
"learning_rate": 8e-05,
"loss": 1.622,
"step": 1328
},
{
"epoch": 0.27601246105919003,
"grad_norm": 0.48729175329208374,
"learning_rate": 8e-05,
"loss": 1.6105,
"step": 1329
},
{
"epoch": 0.2762201453790239,
"grad_norm": 0.4850686490535736,
"learning_rate": 8e-05,
"loss": 1.5978,
"step": 1330
},
{
"epoch": 0.27642782969885776,
"grad_norm": 0.49125537276268005,
"learning_rate": 8e-05,
"loss": 1.5584,
"step": 1331
},
{
"epoch": 0.2766355140186916,
"grad_norm": 0.5107133984565735,
"learning_rate": 8e-05,
"loss": 1.6258,
"step": 1332
},
{
"epoch": 0.27684319833852544,
"grad_norm": 0.4897415339946747,
"learning_rate": 8e-05,
"loss": 1.6198,
"step": 1333
},
{
"epoch": 0.2770508826583593,
"grad_norm": 0.4734158217906952,
"learning_rate": 8e-05,
"loss": 1.5732,
"step": 1334
},
{
"epoch": 0.2772585669781931,
"grad_norm": 0.5000379681587219,
"learning_rate": 8e-05,
"loss": 1.6496,
"step": 1335
},
{
"epoch": 0.277466251298027,
"grad_norm": 0.5036786198616028,
"learning_rate": 8e-05,
"loss": 1.6218,
"step": 1336
},
{
"epoch": 0.27767393561786086,
"grad_norm": 0.4840192198753357,
"learning_rate": 8e-05,
"loss": 1.5044,
"step": 1337
},
{
"epoch": 0.2778816199376947,
"grad_norm": 0.4741503894329071,
"learning_rate": 8e-05,
"loss": 1.5801,
"step": 1338
},
{
"epoch": 0.27808930425752854,
"grad_norm": 0.488294392824173,
"learning_rate": 8e-05,
"loss": 1.597,
"step": 1339
},
{
"epoch": 0.2782969885773624,
"grad_norm": 0.5086274147033691,
"learning_rate": 8e-05,
"loss": 1.6069,
"step": 1340
},
{
"epoch": 0.27850467289719627,
"grad_norm": 0.510127604007721,
"learning_rate": 8e-05,
"loss": 1.596,
"step": 1341
},
{
"epoch": 0.27871235721703014,
"grad_norm": 0.46870115399360657,
"learning_rate": 8e-05,
"loss": 1.5364,
"step": 1342
},
{
"epoch": 0.27892004153686395,
"grad_norm": 0.45464587211608887,
"learning_rate": 8e-05,
"loss": 1.5018,
"step": 1343
},
{
"epoch": 0.2791277258566978,
"grad_norm": 0.48707786202430725,
"learning_rate": 8e-05,
"loss": 1.5411,
"step": 1344
},
{
"epoch": 0.2793354101765317,
"grad_norm": 0.4926038086414337,
"learning_rate": 8e-05,
"loss": 1.6213,
"step": 1345
},
{
"epoch": 0.27954309449636555,
"grad_norm": 0.48424461483955383,
"learning_rate": 8e-05,
"loss": 1.6463,
"step": 1346
},
{
"epoch": 0.27975077881619936,
"grad_norm": 0.48776450753211975,
"learning_rate": 8e-05,
"loss": 1.6108,
"step": 1347
},
{
"epoch": 0.27995846313603323,
"grad_norm": 0.4794577360153198,
"learning_rate": 8e-05,
"loss": 1.5981,
"step": 1348
},
{
"epoch": 0.2801661474558671,
"grad_norm": 0.4882426857948303,
"learning_rate": 8e-05,
"loss": 1.6774,
"step": 1349
},
{
"epoch": 0.2803738317757009,
"grad_norm": 0.5013483166694641,
"learning_rate": 8e-05,
"loss": 1.6659,
"step": 1350
},
{
"epoch": 0.2805815160955348,
"grad_norm": 0.4758848547935486,
"learning_rate": 8e-05,
"loss": 1.5826,
"step": 1351
},
{
"epoch": 0.28078920041536864,
"grad_norm": 0.4662100374698639,
"learning_rate": 8e-05,
"loss": 1.5151,
"step": 1352
},
{
"epoch": 0.2809968847352025,
"grad_norm": 0.47722524404525757,
"learning_rate": 8e-05,
"loss": 1.5534,
"step": 1353
},
{
"epoch": 0.2812045690550363,
"grad_norm": 0.5073485374450684,
"learning_rate": 8e-05,
"loss": 1.6202,
"step": 1354
},
{
"epoch": 0.2814122533748702,
"grad_norm": 0.49153366684913635,
"learning_rate": 8e-05,
"loss": 1.5921,
"step": 1355
},
{
"epoch": 0.28161993769470406,
"grad_norm": 0.4862796664237976,
"learning_rate": 8e-05,
"loss": 1.5981,
"step": 1356
},
{
"epoch": 0.2818276220145379,
"grad_norm": 0.49322280287742615,
"learning_rate": 8e-05,
"loss": 1.6894,
"step": 1357
},
{
"epoch": 0.28203530633437174,
"grad_norm": 0.48775988817214966,
"learning_rate": 8e-05,
"loss": 1.6123,
"step": 1358
},
{
"epoch": 0.2822429906542056,
"grad_norm": 0.47947344183921814,
"learning_rate": 8e-05,
"loss": 1.5709,
"step": 1359
},
{
"epoch": 0.2824506749740395,
"grad_norm": 0.5054206848144531,
"learning_rate": 8e-05,
"loss": 1.6541,
"step": 1360
},
{
"epoch": 0.28265835929387334,
"grad_norm": 0.4824366569519043,
"learning_rate": 8e-05,
"loss": 1.5929,
"step": 1361
},
{
"epoch": 0.28286604361370715,
"grad_norm": 0.49978917837142944,
"learning_rate": 8e-05,
"loss": 1.6151,
"step": 1362
},
{
"epoch": 0.283073727933541,
"grad_norm": 0.4934566915035248,
"learning_rate": 8e-05,
"loss": 1.6641,
"step": 1363
},
{
"epoch": 0.2832814122533749,
"grad_norm": 0.4824499785900116,
"learning_rate": 8e-05,
"loss": 1.6189,
"step": 1364
},
{
"epoch": 0.2834890965732087,
"grad_norm": 0.47832030057907104,
"learning_rate": 8e-05,
"loss": 1.5083,
"step": 1365
},
{
"epoch": 0.28369678089304257,
"grad_norm": 0.4860023260116577,
"learning_rate": 8e-05,
"loss": 1.565,
"step": 1366
},
{
"epoch": 0.28390446521287643,
"grad_norm": 0.48774632811546326,
"learning_rate": 8e-05,
"loss": 1.638,
"step": 1367
},
{
"epoch": 0.2841121495327103,
"grad_norm": 0.47733739018440247,
"learning_rate": 8e-05,
"loss": 1.6151,
"step": 1368
},
{
"epoch": 0.2843198338525441,
"grad_norm": 0.48727935552597046,
"learning_rate": 8e-05,
"loss": 1.605,
"step": 1369
},
{
"epoch": 0.284527518172378,
"grad_norm": 0.4770117998123169,
"learning_rate": 8e-05,
"loss": 1.6759,
"step": 1370
},
{
"epoch": 0.28473520249221185,
"grad_norm": 0.49704787135124207,
"learning_rate": 8e-05,
"loss": 1.6116,
"step": 1371
},
{
"epoch": 0.2849428868120457,
"grad_norm": 0.48451000452041626,
"learning_rate": 8e-05,
"loss": 1.5776,
"step": 1372
},
{
"epoch": 0.2851505711318795,
"grad_norm": 0.49385419487953186,
"learning_rate": 8e-05,
"loss": 1.6142,
"step": 1373
},
{
"epoch": 0.2853582554517134,
"grad_norm": 0.49187561869621277,
"learning_rate": 8e-05,
"loss": 1.6076,
"step": 1374
},
{
"epoch": 0.28556593977154726,
"grad_norm": 0.4695932865142822,
"learning_rate": 8e-05,
"loss": 1.5801,
"step": 1375
},
{
"epoch": 0.2857736240913811,
"grad_norm": 0.49953845143318176,
"learning_rate": 8e-05,
"loss": 1.617,
"step": 1376
},
{
"epoch": 0.28598130841121494,
"grad_norm": 0.49016183614730835,
"learning_rate": 8e-05,
"loss": 1.6751,
"step": 1377
},
{
"epoch": 0.2861889927310488,
"grad_norm": 0.5039032101631165,
"learning_rate": 8e-05,
"loss": 1.5667,
"step": 1378
},
{
"epoch": 0.2863966770508827,
"grad_norm": 0.5153660178184509,
"learning_rate": 8e-05,
"loss": 1.5351,
"step": 1379
},
{
"epoch": 0.2866043613707165,
"grad_norm": 0.4923986792564392,
"learning_rate": 8e-05,
"loss": 1.6022,
"step": 1380
},
{
"epoch": 0.28681204569055035,
"grad_norm": 0.4953305125236511,
"learning_rate": 8e-05,
"loss": 1.6172,
"step": 1381
},
{
"epoch": 0.2870197300103842,
"grad_norm": 0.48334407806396484,
"learning_rate": 8e-05,
"loss": 1.5416,
"step": 1382
},
{
"epoch": 0.2872274143302181,
"grad_norm": 0.4850659966468811,
"learning_rate": 8e-05,
"loss": 1.6704,
"step": 1383
},
{
"epoch": 0.2874350986500519,
"grad_norm": 0.5282303690910339,
"learning_rate": 8e-05,
"loss": 1.6134,
"step": 1384
},
{
"epoch": 0.28764278296988577,
"grad_norm": 0.4880530834197998,
"learning_rate": 8e-05,
"loss": 1.6108,
"step": 1385
},
{
"epoch": 0.28785046728971964,
"grad_norm": 0.5376881957054138,
"learning_rate": 8e-05,
"loss": 1.6852,
"step": 1386
},
{
"epoch": 0.2880581516095535,
"grad_norm": 0.5560894012451172,
"learning_rate": 8e-05,
"loss": 1.677,
"step": 1387
},
{
"epoch": 0.2882658359293873,
"grad_norm": 0.5074867606163025,
"learning_rate": 8e-05,
"loss": 1.5932,
"step": 1388
},
{
"epoch": 0.2884735202492212,
"grad_norm": 0.4841192066669464,
"learning_rate": 8e-05,
"loss": 1.5258,
"step": 1389
},
{
"epoch": 0.28868120456905505,
"grad_norm": 0.5013852715492249,
"learning_rate": 8e-05,
"loss": 1.5784,
"step": 1390
},
{
"epoch": 0.28888888888888886,
"grad_norm": 0.494810551404953,
"learning_rate": 8e-05,
"loss": 1.5967,
"step": 1391
},
{
"epoch": 0.28909657320872273,
"grad_norm": 0.49166232347488403,
"learning_rate": 8e-05,
"loss": 1.5865,
"step": 1392
},
{
"epoch": 0.2893042575285566,
"grad_norm": 0.5110693573951721,
"learning_rate": 8e-05,
"loss": 1.5969,
"step": 1393
},
{
"epoch": 0.28951194184839046,
"grad_norm": 0.542017936706543,
"learning_rate": 8e-05,
"loss": 1.6099,
"step": 1394
},
{
"epoch": 0.2897196261682243,
"grad_norm": 0.5205735564231873,
"learning_rate": 8e-05,
"loss": 1.6194,
"step": 1395
},
{
"epoch": 0.28992731048805814,
"grad_norm": 0.542843222618103,
"learning_rate": 8e-05,
"loss": 1.5778,
"step": 1396
},
{
"epoch": 0.290134994807892,
"grad_norm": 0.5020806789398193,
"learning_rate": 8e-05,
"loss": 1.6645,
"step": 1397
},
{
"epoch": 0.2903426791277259,
"grad_norm": 0.48713013529777527,
"learning_rate": 8e-05,
"loss": 1.5229,
"step": 1398
},
{
"epoch": 0.2905503634475597,
"grad_norm": 0.49321314692497253,
"learning_rate": 8e-05,
"loss": 1.6144,
"step": 1399
},
{
"epoch": 0.29075804776739356,
"grad_norm": 0.4920049011707306,
"learning_rate": 8e-05,
"loss": 1.6081,
"step": 1400
},
{
"epoch": 0.2909657320872274,
"grad_norm": 0.4814404249191284,
"learning_rate": 8e-05,
"loss": 1.5424,
"step": 1401
},
{
"epoch": 0.2911734164070613,
"grad_norm": 0.48672202229499817,
"learning_rate": 8e-05,
"loss": 1.5617,
"step": 1402
},
{
"epoch": 0.2913811007268951,
"grad_norm": 0.4993645250797272,
"learning_rate": 8e-05,
"loss": 1.5552,
"step": 1403
},
{
"epoch": 0.29158878504672897,
"grad_norm": 0.493221640586853,
"learning_rate": 8e-05,
"loss": 1.6083,
"step": 1404
},
{
"epoch": 0.29179646936656284,
"grad_norm": 0.5242977738380432,
"learning_rate": 8e-05,
"loss": 1.6794,
"step": 1405
},
{
"epoch": 0.29200415368639665,
"grad_norm": 0.5010468363761902,
"learning_rate": 8e-05,
"loss": 1.6118,
"step": 1406
},
{
"epoch": 0.2922118380062305,
"grad_norm": 0.4755447506904602,
"learning_rate": 8e-05,
"loss": 1.5602,
"step": 1407
},
{
"epoch": 0.2924195223260644,
"grad_norm": 0.48869359493255615,
"learning_rate": 8e-05,
"loss": 1.6312,
"step": 1408
},
{
"epoch": 0.29262720664589825,
"grad_norm": 0.505214512348175,
"learning_rate": 8e-05,
"loss": 1.7103,
"step": 1409
},
{
"epoch": 0.29283489096573206,
"grad_norm": 0.4835333526134491,
"learning_rate": 8e-05,
"loss": 1.5842,
"step": 1410
},
{
"epoch": 0.29304257528556593,
"grad_norm": 0.47053441405296326,
"learning_rate": 8e-05,
"loss": 1.5655,
"step": 1411
},
{
"epoch": 0.2932502596053998,
"grad_norm": 0.4897231161594391,
"learning_rate": 8e-05,
"loss": 1.5427,
"step": 1412
},
{
"epoch": 0.29345794392523367,
"grad_norm": 0.4817632734775543,
"learning_rate": 8e-05,
"loss": 1.5703,
"step": 1413
},
{
"epoch": 0.2936656282450675,
"grad_norm": 0.5053437352180481,
"learning_rate": 8e-05,
"loss": 1.5884,
"step": 1414
},
{
"epoch": 0.29387331256490135,
"grad_norm": 0.48642995953559875,
"learning_rate": 8e-05,
"loss": 1.5685,
"step": 1415
},
{
"epoch": 0.2940809968847352,
"grad_norm": 0.4815353453159332,
"learning_rate": 8e-05,
"loss": 1.6158,
"step": 1416
},
{
"epoch": 0.2942886812045691,
"grad_norm": 0.49331390857696533,
"learning_rate": 8e-05,
"loss": 1.6346,
"step": 1417
},
{
"epoch": 0.2944963655244029,
"grad_norm": 0.5165848135948181,
"learning_rate": 8e-05,
"loss": 1.6028,
"step": 1418
},
{
"epoch": 0.29470404984423676,
"grad_norm": 0.4790689945220947,
"learning_rate": 8e-05,
"loss": 1.6281,
"step": 1419
},
{
"epoch": 0.2949117341640706,
"grad_norm": 0.46984192728996277,
"learning_rate": 8e-05,
"loss": 1.5487,
"step": 1420
},
{
"epoch": 0.29511941848390444,
"grad_norm": 0.5013918876647949,
"learning_rate": 8e-05,
"loss": 1.5629,
"step": 1421
},
{
"epoch": 0.2953271028037383,
"grad_norm": 0.5373260974884033,
"learning_rate": 8e-05,
"loss": 1.6564,
"step": 1422
},
{
"epoch": 0.2955347871235722,
"grad_norm": 0.5395689606666565,
"learning_rate": 8e-05,
"loss": 1.6307,
"step": 1423
},
{
"epoch": 0.29574247144340604,
"grad_norm": 0.5050875544548035,
"learning_rate": 8e-05,
"loss": 1.5577,
"step": 1424
},
{
"epoch": 0.29595015576323985,
"grad_norm": 0.46294352412223816,
"learning_rate": 8e-05,
"loss": 1.5705,
"step": 1425
},
{
"epoch": 0.2961578400830737,
"grad_norm": 0.4974973201751709,
"learning_rate": 8e-05,
"loss": 1.5908,
"step": 1426
},
{
"epoch": 0.2963655244029076,
"grad_norm": 0.5147601366043091,
"learning_rate": 8e-05,
"loss": 1.5833,
"step": 1427
},
{
"epoch": 0.29657320872274145,
"grad_norm": 0.5149271488189697,
"learning_rate": 8e-05,
"loss": 1.6029,
"step": 1428
},
{
"epoch": 0.29678089304257527,
"grad_norm": 0.48388561606407166,
"learning_rate": 8e-05,
"loss": 1.5629,
"step": 1429
},
{
"epoch": 0.29698857736240913,
"grad_norm": 0.4807972013950348,
"learning_rate": 8e-05,
"loss": 1.5618,
"step": 1430
},
{
"epoch": 0.297196261682243,
"grad_norm": 0.4944983124732971,
"learning_rate": 8e-05,
"loss": 1.6518,
"step": 1431
},
{
"epoch": 0.29740394600207687,
"grad_norm": 0.467385470867157,
"learning_rate": 8e-05,
"loss": 1.5768,
"step": 1432
},
{
"epoch": 0.2976116303219107,
"grad_norm": 0.4817611277103424,
"learning_rate": 8e-05,
"loss": 1.559,
"step": 1433
},
{
"epoch": 0.29781931464174455,
"grad_norm": 0.4777863919734955,
"learning_rate": 8e-05,
"loss": 1.5712,
"step": 1434
},
{
"epoch": 0.2980269989615784,
"grad_norm": 0.46716588735580444,
"learning_rate": 8e-05,
"loss": 1.5301,
"step": 1435
},
{
"epoch": 0.2982346832814122,
"grad_norm": 0.49335551261901855,
"learning_rate": 8e-05,
"loss": 1.633,
"step": 1436
},
{
"epoch": 0.2984423676012461,
"grad_norm": 0.47975221276283264,
"learning_rate": 8e-05,
"loss": 1.5948,
"step": 1437
},
{
"epoch": 0.29865005192107996,
"grad_norm": 0.4699002504348755,
"learning_rate": 8e-05,
"loss": 1.5368,
"step": 1438
},
{
"epoch": 0.29885773624091383,
"grad_norm": 0.5026534199714661,
"learning_rate": 8e-05,
"loss": 1.6412,
"step": 1439
},
{
"epoch": 0.29906542056074764,
"grad_norm": 0.4809010326862335,
"learning_rate": 8e-05,
"loss": 1.5511,
"step": 1440
},
{
"epoch": 0.2992731048805815,
"grad_norm": 0.5056330561637878,
"learning_rate": 8e-05,
"loss": 1.6516,
"step": 1441
},
{
"epoch": 0.2994807892004154,
"grad_norm": 0.49161332845687866,
"learning_rate": 8e-05,
"loss": 1.5964,
"step": 1442
},
{
"epoch": 0.29968847352024924,
"grad_norm": 0.49896249175071716,
"learning_rate": 8e-05,
"loss": 1.644,
"step": 1443
},
{
"epoch": 0.29989615784008306,
"grad_norm": 0.4822237491607666,
"learning_rate": 8e-05,
"loss": 1.6416,
"step": 1444
},
{
"epoch": 0.3001038421599169,
"grad_norm": 0.48241591453552246,
"learning_rate": 8e-05,
"loss": 1.6328,
"step": 1445
},
{
"epoch": 0.3003115264797508,
"grad_norm": 0.48107171058654785,
"learning_rate": 8e-05,
"loss": 1.6912,
"step": 1446
},
{
"epoch": 0.30051921079958466,
"grad_norm": 0.4950486421585083,
"learning_rate": 8e-05,
"loss": 1.5809,
"step": 1447
},
{
"epoch": 0.30072689511941847,
"grad_norm": 0.516384482383728,
"learning_rate": 8e-05,
"loss": 1.5433,
"step": 1448
},
{
"epoch": 0.30093457943925234,
"grad_norm": 0.47670841217041016,
"learning_rate": 8e-05,
"loss": 1.5641,
"step": 1449
},
{
"epoch": 0.3011422637590862,
"grad_norm": 0.4875502288341522,
"learning_rate": 8e-05,
"loss": 1.5905,
"step": 1450
},
{
"epoch": 0.30134994807892,
"grad_norm": 0.4830038547515869,
"learning_rate": 8e-05,
"loss": 1.5338,
"step": 1451
},
{
"epoch": 0.3015576323987539,
"grad_norm": 0.4864535331726074,
"learning_rate": 8e-05,
"loss": 1.5884,
"step": 1452
},
{
"epoch": 0.30176531671858775,
"grad_norm": 0.48559343814849854,
"learning_rate": 8e-05,
"loss": 1.5684,
"step": 1453
},
{
"epoch": 0.3019730010384216,
"grad_norm": 0.4923616051673889,
"learning_rate": 8e-05,
"loss": 1.5975,
"step": 1454
},
{
"epoch": 0.30218068535825543,
"grad_norm": 0.4875079095363617,
"learning_rate": 8e-05,
"loss": 1.5574,
"step": 1455
},
{
"epoch": 0.3023883696780893,
"grad_norm": 0.5041059851646423,
"learning_rate": 8e-05,
"loss": 1.566,
"step": 1456
},
{
"epoch": 0.30259605399792316,
"grad_norm": 0.5048243403434753,
"learning_rate": 8e-05,
"loss": 1.5744,
"step": 1457
},
{
"epoch": 0.30280373831775703,
"grad_norm": 0.48077791929244995,
"learning_rate": 8e-05,
"loss": 1.6088,
"step": 1458
},
{
"epoch": 0.30301142263759084,
"grad_norm": 0.49609705805778503,
"learning_rate": 8e-05,
"loss": 1.6863,
"step": 1459
},
{
"epoch": 0.3032191069574247,
"grad_norm": 0.5017523765563965,
"learning_rate": 8e-05,
"loss": 1.6013,
"step": 1460
},
{
"epoch": 0.3034267912772586,
"grad_norm": 0.4952293634414673,
"learning_rate": 8e-05,
"loss": 1.6157,
"step": 1461
},
{
"epoch": 0.30363447559709245,
"grad_norm": 0.4877483546733856,
"learning_rate": 8e-05,
"loss": 1.5992,
"step": 1462
},
{
"epoch": 0.30384215991692626,
"grad_norm": 0.4839021861553192,
"learning_rate": 8e-05,
"loss": 1.66,
"step": 1463
},
{
"epoch": 0.3040498442367601,
"grad_norm": 0.523611307144165,
"learning_rate": 8e-05,
"loss": 1.6295,
"step": 1464
},
{
"epoch": 0.304257528556594,
"grad_norm": 0.4874498248100281,
"learning_rate": 8e-05,
"loss": 1.5778,
"step": 1465
},
{
"epoch": 0.3044652128764278,
"grad_norm": 0.49205684661865234,
"learning_rate": 8e-05,
"loss": 1.575,
"step": 1466
},
{
"epoch": 0.30467289719626167,
"grad_norm": 0.5001851916313171,
"learning_rate": 8e-05,
"loss": 1.6221,
"step": 1467
},
{
"epoch": 0.30488058151609554,
"grad_norm": 0.5134277939796448,
"learning_rate": 8e-05,
"loss": 1.5791,
"step": 1468
},
{
"epoch": 0.3050882658359294,
"grad_norm": 0.4986041784286499,
"learning_rate": 8e-05,
"loss": 1.6249,
"step": 1469
},
{
"epoch": 0.3052959501557632,
"grad_norm": 0.4865819811820984,
"learning_rate": 8e-05,
"loss": 1.6065,
"step": 1470
},
{
"epoch": 0.3055036344755971,
"grad_norm": 0.48902952671051025,
"learning_rate": 8e-05,
"loss": 1.6383,
"step": 1471
},
{
"epoch": 0.30571131879543095,
"grad_norm": 0.5065701007843018,
"learning_rate": 8e-05,
"loss": 1.5784,
"step": 1472
},
{
"epoch": 0.3059190031152648,
"grad_norm": 0.48749783635139465,
"learning_rate": 8e-05,
"loss": 1.5601,
"step": 1473
},
{
"epoch": 0.30612668743509863,
"grad_norm": 0.517052948474884,
"learning_rate": 8e-05,
"loss": 1.6655,
"step": 1474
},
{
"epoch": 0.3063343717549325,
"grad_norm": 0.4751560389995575,
"learning_rate": 8e-05,
"loss": 1.5892,
"step": 1475
},
{
"epoch": 0.30654205607476637,
"grad_norm": 0.4731171429157257,
"learning_rate": 8e-05,
"loss": 1.5153,
"step": 1476
},
{
"epoch": 0.30674974039460023,
"grad_norm": 0.5009525418281555,
"learning_rate": 8e-05,
"loss": 1.6547,
"step": 1477
},
{
"epoch": 0.30695742471443405,
"grad_norm": 0.4930509626865387,
"learning_rate": 8e-05,
"loss": 1.513,
"step": 1478
},
{
"epoch": 0.3071651090342679,
"grad_norm": 0.5279929041862488,
"learning_rate": 8e-05,
"loss": 1.6086,
"step": 1479
},
{
"epoch": 0.3073727933541018,
"grad_norm": 0.4923204481601715,
"learning_rate": 8e-05,
"loss": 1.5659,
"step": 1480
},
{
"epoch": 0.3075804776739356,
"grad_norm": 0.4844015836715698,
"learning_rate": 8e-05,
"loss": 1.6044,
"step": 1481
},
{
"epoch": 0.30778816199376946,
"grad_norm": 0.5059119462966919,
"learning_rate": 8e-05,
"loss": 1.61,
"step": 1482
},
{
"epoch": 0.3079958463136033,
"grad_norm": 0.5250138640403748,
"learning_rate": 8e-05,
"loss": 1.5921,
"step": 1483
},
{
"epoch": 0.3082035306334372,
"grad_norm": 0.512847363948822,
"learning_rate": 8e-05,
"loss": 1.6694,
"step": 1484
},
{
"epoch": 0.308411214953271,
"grad_norm": 0.5249167084693909,
"learning_rate": 8e-05,
"loss": 1.6558,
"step": 1485
},
{
"epoch": 0.3086188992731049,
"grad_norm": 0.5241632461547852,
"learning_rate": 8e-05,
"loss": 1.6323,
"step": 1486
},
{
"epoch": 0.30882658359293874,
"grad_norm": 0.49093738198280334,
"learning_rate": 8e-05,
"loss": 1.6032,
"step": 1487
},
{
"epoch": 0.3090342679127726,
"grad_norm": 0.49326688051223755,
"learning_rate": 8e-05,
"loss": 1.5726,
"step": 1488
},
{
"epoch": 0.3092419522326064,
"grad_norm": 0.49778178334236145,
"learning_rate": 8e-05,
"loss": 1.5792,
"step": 1489
},
{
"epoch": 0.3094496365524403,
"grad_norm": 0.49950110912323,
"learning_rate": 8e-05,
"loss": 1.5706,
"step": 1490
},
{
"epoch": 0.30965732087227416,
"grad_norm": 0.5080294013023376,
"learning_rate": 8e-05,
"loss": 1.64,
"step": 1491
},
{
"epoch": 0.309865005192108,
"grad_norm": 0.5099862217903137,
"learning_rate": 8e-05,
"loss": 1.6631,
"step": 1492
},
{
"epoch": 0.31007268951194183,
"grad_norm": 0.49251455068588257,
"learning_rate": 8e-05,
"loss": 1.5711,
"step": 1493
},
{
"epoch": 0.3102803738317757,
"grad_norm": 0.49985072016716003,
"learning_rate": 8e-05,
"loss": 1.5286,
"step": 1494
},
{
"epoch": 0.31048805815160957,
"grad_norm": 0.5015414953231812,
"learning_rate": 8e-05,
"loss": 1.5992,
"step": 1495
},
{
"epoch": 0.3106957424714434,
"grad_norm": 0.5101944804191589,
"learning_rate": 8e-05,
"loss": 1.5845,
"step": 1496
},
{
"epoch": 0.31090342679127725,
"grad_norm": 0.5153470635414124,
"learning_rate": 8e-05,
"loss": 1.7133,
"step": 1497
},
{
"epoch": 0.3111111111111111,
"grad_norm": 0.5057701468467712,
"learning_rate": 8e-05,
"loss": 1.6648,
"step": 1498
},
{
"epoch": 0.311318795430945,
"grad_norm": 0.5079687833786011,
"learning_rate": 8e-05,
"loss": 1.6237,
"step": 1499
},
{
"epoch": 0.3115264797507788,
"grad_norm": 0.475410133600235,
"learning_rate": 8e-05,
"loss": 1.6165,
"step": 1500
},
{
"epoch": 0.31173416407061266,
"grad_norm": 0.5307968854904175,
"learning_rate": 8e-05,
"loss": 1.5859,
"step": 1501
},
{
"epoch": 0.31194184839044653,
"grad_norm": 0.47486138343811035,
"learning_rate": 8e-05,
"loss": 1.5676,
"step": 1502
},
{
"epoch": 0.3121495327102804,
"grad_norm": 0.5097178220748901,
"learning_rate": 8e-05,
"loss": 1.5636,
"step": 1503
},
{
"epoch": 0.3123572170301142,
"grad_norm": 0.4760870039463043,
"learning_rate": 8e-05,
"loss": 1.5721,
"step": 1504
},
{
"epoch": 0.3125649013499481,
"grad_norm": 0.4842042624950409,
"learning_rate": 8e-05,
"loss": 1.6645,
"step": 1505
},
{
"epoch": 0.31277258566978194,
"grad_norm": 0.5192430019378662,
"learning_rate": 8e-05,
"loss": 1.6495,
"step": 1506
},
{
"epoch": 0.3129802699896158,
"grad_norm": 0.4800757169723511,
"learning_rate": 8e-05,
"loss": 1.5741,
"step": 1507
},
{
"epoch": 0.3131879543094496,
"grad_norm": 0.4967311918735504,
"learning_rate": 8e-05,
"loss": 1.6412,
"step": 1508
},
{
"epoch": 0.3133956386292835,
"grad_norm": 0.4981956481933594,
"learning_rate": 8e-05,
"loss": 1.6111,
"step": 1509
},
{
"epoch": 0.31360332294911736,
"grad_norm": 0.5167219042778015,
"learning_rate": 8e-05,
"loss": 1.6406,
"step": 1510
},
{
"epoch": 0.31381100726895117,
"grad_norm": 0.48610803484916687,
"learning_rate": 8e-05,
"loss": 1.5525,
"step": 1511
},
{
"epoch": 0.31401869158878504,
"grad_norm": 0.47807812690734863,
"learning_rate": 8e-05,
"loss": 1.5205,
"step": 1512
},
{
"epoch": 0.3142263759086189,
"grad_norm": 0.5001601576805115,
"learning_rate": 8e-05,
"loss": 1.6265,
"step": 1513
},
{
"epoch": 0.31443406022845277,
"grad_norm": 0.5112875699996948,
"learning_rate": 8e-05,
"loss": 1.6357,
"step": 1514
},
{
"epoch": 0.3146417445482866,
"grad_norm": 0.4997986853122711,
"learning_rate": 8e-05,
"loss": 1.6137,
"step": 1515
},
{
"epoch": 0.31484942886812045,
"grad_norm": 0.4741184115409851,
"learning_rate": 8e-05,
"loss": 1.6266,
"step": 1516
},
{
"epoch": 0.3150571131879543,
"grad_norm": 0.5144311785697937,
"learning_rate": 8e-05,
"loss": 1.5782,
"step": 1517
},
{
"epoch": 0.3152647975077882,
"grad_norm": 0.5066690444946289,
"learning_rate": 8e-05,
"loss": 1.6677,
"step": 1518
},
{
"epoch": 0.315472481827622,
"grad_norm": 0.5042521953582764,
"learning_rate": 8e-05,
"loss": 1.5916,
"step": 1519
},
{
"epoch": 0.31568016614745587,
"grad_norm": 0.4977933466434479,
"learning_rate": 8e-05,
"loss": 1.5574,
"step": 1520
},
{
"epoch": 0.31588785046728973,
"grad_norm": 0.5137097239494324,
"learning_rate": 8e-05,
"loss": 1.6411,
"step": 1521
},
{
"epoch": 0.3160955347871236,
"grad_norm": 0.4740540385246277,
"learning_rate": 8e-05,
"loss": 1.57,
"step": 1522
},
{
"epoch": 0.3163032191069574,
"grad_norm": 0.5093042254447937,
"learning_rate": 8e-05,
"loss": 1.6752,
"step": 1523
},
{
"epoch": 0.3165109034267913,
"grad_norm": 0.47612494230270386,
"learning_rate": 8e-05,
"loss": 1.5722,
"step": 1524
},
{
"epoch": 0.31671858774662515,
"grad_norm": 0.5046824812889099,
"learning_rate": 8e-05,
"loss": 1.649,
"step": 1525
},
{
"epoch": 0.31692627206645896,
"grad_norm": 0.49512314796447754,
"learning_rate": 8e-05,
"loss": 1.5808,
"step": 1526
},
{
"epoch": 0.3171339563862928,
"grad_norm": 0.4888535737991333,
"learning_rate": 8e-05,
"loss": 1.6138,
"step": 1527
},
{
"epoch": 0.3173416407061267,
"grad_norm": 0.5067277550697327,
"learning_rate": 8e-05,
"loss": 1.626,
"step": 1528
},
{
"epoch": 0.31754932502596056,
"grad_norm": 0.47953104972839355,
"learning_rate": 8e-05,
"loss": 1.6125,
"step": 1529
},
{
"epoch": 0.3177570093457944,
"grad_norm": 0.5111708641052246,
"learning_rate": 8e-05,
"loss": 1.5847,
"step": 1530
},
{
"epoch": 0.31796469366562824,
"grad_norm": 0.48254770040512085,
"learning_rate": 8e-05,
"loss": 1.5371,
"step": 1531
},
{
"epoch": 0.3181723779854621,
"grad_norm": 0.6176688075065613,
"learning_rate": 8e-05,
"loss": 1.5441,
"step": 1532
},
{
"epoch": 0.318380062305296,
"grad_norm": 0.4850156307220459,
"learning_rate": 8e-05,
"loss": 1.573,
"step": 1533
},
{
"epoch": 0.3185877466251298,
"grad_norm": 0.4983154237270355,
"learning_rate": 8e-05,
"loss": 1.5298,
"step": 1534
},
{
"epoch": 0.31879543094496365,
"grad_norm": 0.5193361043930054,
"learning_rate": 8e-05,
"loss": 1.6185,
"step": 1535
},
{
"epoch": 0.3190031152647975,
"grad_norm": 0.483345627784729,
"learning_rate": 8e-05,
"loss": 1.5397,
"step": 1536
},
{
"epoch": 0.31921079958463133,
"grad_norm": 0.48317113518714905,
"learning_rate": 8e-05,
"loss": 1.6233,
"step": 1537
},
{
"epoch": 0.3194184839044652,
"grad_norm": 0.4782498776912689,
"learning_rate": 8e-05,
"loss": 1.5813,
"step": 1538
},
{
"epoch": 0.31962616822429907,
"grad_norm": 0.47537708282470703,
"learning_rate": 8e-05,
"loss": 1.5454,
"step": 1539
},
{
"epoch": 0.31983385254413293,
"grad_norm": 0.4868844151496887,
"learning_rate": 8e-05,
"loss": 1.6,
"step": 1540
},
{
"epoch": 0.32004153686396675,
"grad_norm": 0.5081853866577148,
"learning_rate": 8e-05,
"loss": 1.5926,
"step": 1541
},
{
"epoch": 0.3202492211838006,
"grad_norm": 0.48315268754959106,
"learning_rate": 8e-05,
"loss": 1.5898,
"step": 1542
},
{
"epoch": 0.3204569055036345,
"grad_norm": 0.4909741282463074,
"learning_rate": 8e-05,
"loss": 1.5828,
"step": 1543
},
{
"epoch": 0.32066458982346835,
"grad_norm": 0.49861210584640503,
"learning_rate": 8e-05,
"loss": 1.6455,
"step": 1544
},
{
"epoch": 0.32087227414330216,
"grad_norm": 0.478344589471817,
"learning_rate": 8e-05,
"loss": 1.6188,
"step": 1545
},
{
"epoch": 0.32107995846313603,
"grad_norm": 0.5060451030731201,
"learning_rate": 8e-05,
"loss": 1.5827,
"step": 1546
},
{
"epoch": 0.3212876427829699,
"grad_norm": 0.48769256472587585,
"learning_rate": 8e-05,
"loss": 1.6526,
"step": 1547
},
{
"epoch": 0.32149532710280376,
"grad_norm": 0.492779403924942,
"learning_rate": 8e-05,
"loss": 1.6032,
"step": 1548
},
{
"epoch": 0.3217030114226376,
"grad_norm": 0.4987448453903198,
"learning_rate": 8e-05,
"loss": 1.5259,
"step": 1549
},
{
"epoch": 0.32191069574247144,
"grad_norm": 0.48688334226608276,
"learning_rate": 8e-05,
"loss": 1.5775,
"step": 1550
},
{
"epoch": 0.3221183800623053,
"grad_norm": 0.4970977306365967,
"learning_rate": 8e-05,
"loss": 1.6524,
"step": 1551
},
{
"epoch": 0.3223260643821391,
"grad_norm": 0.5189974904060364,
"learning_rate": 8e-05,
"loss": 1.5758,
"step": 1552
},
{
"epoch": 0.322533748701973,
"grad_norm": 0.48815828561782837,
"learning_rate": 8e-05,
"loss": 1.5439,
"step": 1553
},
{
"epoch": 0.32274143302180686,
"grad_norm": 0.4929064214229584,
"learning_rate": 8e-05,
"loss": 1.545,
"step": 1554
},
{
"epoch": 0.3229491173416407,
"grad_norm": 0.48322778940200806,
"learning_rate": 8e-05,
"loss": 1.5601,
"step": 1555
},
{
"epoch": 0.32315680166147454,
"grad_norm": 0.5003882646560669,
"learning_rate": 8e-05,
"loss": 1.5581,
"step": 1556
},
{
"epoch": 0.3233644859813084,
"grad_norm": 0.5089275240898132,
"learning_rate": 8e-05,
"loss": 1.6147,
"step": 1557
},
{
"epoch": 0.32357217030114227,
"grad_norm": 0.48767319321632385,
"learning_rate": 8e-05,
"loss": 1.5953,
"step": 1558
},
{
"epoch": 0.32377985462097614,
"grad_norm": 0.5171458125114441,
"learning_rate": 8e-05,
"loss": 1.6395,
"step": 1559
},
{
"epoch": 0.32398753894080995,
"grad_norm": 0.4896124601364136,
"learning_rate": 8e-05,
"loss": 1.6219,
"step": 1560
},
{
"epoch": 0.3241952232606438,
"grad_norm": 0.4953930974006653,
"learning_rate": 8e-05,
"loss": 1.6007,
"step": 1561
},
{
"epoch": 0.3244029075804777,
"grad_norm": 0.5223313570022583,
"learning_rate": 8e-05,
"loss": 1.6687,
"step": 1562
},
{
"epoch": 0.32461059190031155,
"grad_norm": 0.50156569480896,
"learning_rate": 8e-05,
"loss": 1.6407,
"step": 1563
},
{
"epoch": 0.32481827622014536,
"grad_norm": 0.48758435249328613,
"learning_rate": 8e-05,
"loss": 1.5725,
"step": 1564
},
{
"epoch": 0.32502596053997923,
"grad_norm": 0.49146777391433716,
"learning_rate": 8e-05,
"loss": 1.5366,
"step": 1565
},
{
"epoch": 0.3252336448598131,
"grad_norm": 0.5082824230194092,
"learning_rate": 8e-05,
"loss": 1.6088,
"step": 1566
},
{
"epoch": 0.3254413291796469,
"grad_norm": 0.5137510299682617,
"learning_rate": 8e-05,
"loss": 1.6697,
"step": 1567
},
{
"epoch": 0.3256490134994808,
"grad_norm": 0.48702284693717957,
"learning_rate": 8e-05,
"loss": 1.5483,
"step": 1568
},
{
"epoch": 0.32585669781931464,
"grad_norm": 0.4997518062591553,
"learning_rate": 8e-05,
"loss": 1.5455,
"step": 1569
},
{
"epoch": 0.3260643821391485,
"grad_norm": 0.4797830283641815,
"learning_rate": 8e-05,
"loss": 1.6014,
"step": 1570
},
{
"epoch": 0.3262720664589823,
"grad_norm": 0.4881299138069153,
"learning_rate": 8e-05,
"loss": 1.5972,
"step": 1571
},
{
"epoch": 0.3264797507788162,
"grad_norm": 0.4965881407260895,
"learning_rate": 8e-05,
"loss": 1.5945,
"step": 1572
},
{
"epoch": 0.32668743509865006,
"grad_norm": 0.49967294931411743,
"learning_rate": 8e-05,
"loss": 1.5934,
"step": 1573
},
{
"epoch": 0.3268951194184839,
"grad_norm": 0.5094643831253052,
"learning_rate": 8e-05,
"loss": 1.6762,
"step": 1574
},
{
"epoch": 0.32710280373831774,
"grad_norm": 0.4757131040096283,
"learning_rate": 8e-05,
"loss": 1.6471,
"step": 1575
},
{
"epoch": 0.3273104880581516,
"grad_norm": 0.5002568364143372,
"learning_rate": 8e-05,
"loss": 1.538,
"step": 1576
},
{
"epoch": 0.3275181723779855,
"grad_norm": 0.5113179087638855,
"learning_rate": 8e-05,
"loss": 1.5968,
"step": 1577
},
{
"epoch": 0.32772585669781934,
"grad_norm": 0.4950655698776245,
"learning_rate": 8e-05,
"loss": 1.6031,
"step": 1578
},
{
"epoch": 0.32793354101765315,
"grad_norm": 0.4921990931034088,
"learning_rate": 8e-05,
"loss": 1.5819,
"step": 1579
},
{
"epoch": 0.328141225337487,
"grad_norm": 0.48755714297294617,
"learning_rate": 8e-05,
"loss": 1.5708,
"step": 1580
},
{
"epoch": 0.3283489096573209,
"grad_norm": 0.49338051676750183,
"learning_rate": 8e-05,
"loss": 1.6067,
"step": 1581
},
{
"epoch": 0.3285565939771547,
"grad_norm": 0.4969124495983124,
"learning_rate": 8e-05,
"loss": 1.5753,
"step": 1582
},
{
"epoch": 0.32876427829698857,
"grad_norm": 0.49120715260505676,
"learning_rate": 8e-05,
"loss": 1.5947,
"step": 1583
},
{
"epoch": 0.32897196261682243,
"grad_norm": 0.47977736592292786,
"learning_rate": 8e-05,
"loss": 1.5577,
"step": 1584
},
{
"epoch": 0.3291796469366563,
"grad_norm": 0.49589675664901733,
"learning_rate": 8e-05,
"loss": 1.6218,
"step": 1585
},
{
"epoch": 0.3293873312564901,
"grad_norm": 0.506141185760498,
"learning_rate": 8e-05,
"loss": 1.5925,
"step": 1586
},
{
"epoch": 0.329595015576324,
"grad_norm": 0.5115627646446228,
"learning_rate": 8e-05,
"loss": 1.6107,
"step": 1587
},
{
"epoch": 0.32980269989615785,
"grad_norm": 0.5363782644271851,
"learning_rate": 8e-05,
"loss": 1.6134,
"step": 1588
},
{
"epoch": 0.3300103842159917,
"grad_norm": 0.5303878784179688,
"learning_rate": 8e-05,
"loss": 1.6054,
"step": 1589
},
{
"epoch": 0.3302180685358255,
"grad_norm": 0.5335170030593872,
"learning_rate": 8e-05,
"loss": 1.5758,
"step": 1590
},
{
"epoch": 0.3304257528556594,
"grad_norm": 0.49111613631248474,
"learning_rate": 8e-05,
"loss": 1.6672,
"step": 1591
},
{
"epoch": 0.33063343717549326,
"grad_norm": 0.4870830774307251,
"learning_rate": 8e-05,
"loss": 1.6392,
"step": 1592
},
{
"epoch": 0.33084112149532713,
"grad_norm": 0.5026964545249939,
"learning_rate": 8e-05,
"loss": 1.546,
"step": 1593
},
{
"epoch": 0.33104880581516094,
"grad_norm": 0.47665566205978394,
"learning_rate": 8e-05,
"loss": 1.5447,
"step": 1594
},
{
"epoch": 0.3312564901349948,
"grad_norm": 0.4723517894744873,
"learning_rate": 8e-05,
"loss": 1.5622,
"step": 1595
},
{
"epoch": 0.3314641744548287,
"grad_norm": 0.48958736658096313,
"learning_rate": 8e-05,
"loss": 1.5539,
"step": 1596
},
{
"epoch": 0.3316718587746625,
"grad_norm": 0.5126999616622925,
"learning_rate": 8e-05,
"loss": 1.6348,
"step": 1597
},
{
"epoch": 0.33187954309449635,
"grad_norm": 0.5067579746246338,
"learning_rate": 8e-05,
"loss": 1.6046,
"step": 1598
},
{
"epoch": 0.3320872274143302,
"grad_norm": 0.4940493404865265,
"learning_rate": 8e-05,
"loss": 1.5553,
"step": 1599
},
{
"epoch": 0.3322949117341641,
"grad_norm": 0.5140849947929382,
"learning_rate": 8e-05,
"loss": 1.6741,
"step": 1600
},
{
"epoch": 0.3325025960539979,
"grad_norm": 0.47822049260139465,
"learning_rate": 8e-05,
"loss": 1.5474,
"step": 1601
},
{
"epoch": 0.33271028037383177,
"grad_norm": 0.4843752384185791,
"learning_rate": 8e-05,
"loss": 1.5194,
"step": 1602
},
{
"epoch": 0.33291796469366564,
"grad_norm": 0.5137794613838196,
"learning_rate": 8e-05,
"loss": 1.6013,
"step": 1603
},
{
"epoch": 0.3331256490134995,
"grad_norm": 0.4957379698753357,
"learning_rate": 8e-05,
"loss": 1.5426,
"step": 1604
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.527511477470398,
"learning_rate": 8e-05,
"loss": 1.656,
"step": 1605
},
{
"epoch": 0.3335410176531672,
"grad_norm": 0.4836502969264984,
"learning_rate": 8e-05,
"loss": 1.5422,
"step": 1606
},
{
"epoch": 0.33374870197300105,
"grad_norm": 0.47433504462242126,
"learning_rate": 8e-05,
"loss": 1.5434,
"step": 1607
},
{
"epoch": 0.3339563862928349,
"grad_norm": 0.5102041363716125,
"learning_rate": 8e-05,
"loss": 1.7392,
"step": 1608
},
{
"epoch": 0.33416407061266873,
"grad_norm": 0.4976068437099457,
"learning_rate": 8e-05,
"loss": 1.6558,
"step": 1609
},
{
"epoch": 0.3343717549325026,
"grad_norm": 0.49436476826667786,
"learning_rate": 8e-05,
"loss": 1.5691,
"step": 1610
},
{
"epoch": 0.33457943925233646,
"grad_norm": 0.4891389310359955,
"learning_rate": 8e-05,
"loss": 1.6462,
"step": 1611
},
{
"epoch": 0.3347871235721703,
"grad_norm": 0.5754526853561401,
"learning_rate": 8e-05,
"loss": 1.6686,
"step": 1612
},
{
"epoch": 0.33499480789200414,
"grad_norm": 0.49320200085639954,
"learning_rate": 8e-05,
"loss": 1.6849,
"step": 1613
},
{
"epoch": 0.335202492211838,
"grad_norm": 0.4995647370815277,
"learning_rate": 8e-05,
"loss": 1.5772,
"step": 1614
},
{
"epoch": 0.3354101765316719,
"grad_norm": 0.48338550329208374,
"learning_rate": 8e-05,
"loss": 1.5217,
"step": 1615
},
{
"epoch": 0.3356178608515057,
"grad_norm": 0.47079768776893616,
"learning_rate": 8e-05,
"loss": 1.5064,
"step": 1616
},
{
"epoch": 0.33582554517133956,
"grad_norm": 0.5078921914100647,
"learning_rate": 8e-05,
"loss": 1.588,
"step": 1617
},
{
"epoch": 0.3360332294911734,
"grad_norm": 0.5282722115516663,
"learning_rate": 8e-05,
"loss": 1.5059,
"step": 1618
},
{
"epoch": 0.3362409138110073,
"grad_norm": 0.5180701613426208,
"learning_rate": 8e-05,
"loss": 1.5696,
"step": 1619
},
{
"epoch": 0.3364485981308411,
"grad_norm": 0.5044119954109192,
"learning_rate": 8e-05,
"loss": 1.5619,
"step": 1620
},
{
"epoch": 0.33665628245067497,
"grad_norm": 0.5039921402931213,
"learning_rate": 8e-05,
"loss": 1.6287,
"step": 1621
},
{
"epoch": 0.33686396677050884,
"grad_norm": 0.5082249641418457,
"learning_rate": 8e-05,
"loss": 1.5873,
"step": 1622
},
{
"epoch": 0.3370716510903427,
"grad_norm": 0.48252761363983154,
"learning_rate": 8e-05,
"loss": 1.5671,
"step": 1623
},
{
"epoch": 0.3372793354101765,
"grad_norm": 0.5083025097846985,
"learning_rate": 8e-05,
"loss": 1.6022,
"step": 1624
},
{
"epoch": 0.3374870197300104,
"grad_norm": 0.5168206095695496,
"learning_rate": 8e-05,
"loss": 1.6703,
"step": 1625
},
{
"epoch": 0.33769470404984425,
"grad_norm": 0.4816582500934601,
"learning_rate": 8e-05,
"loss": 1.5788,
"step": 1626
},
{
"epoch": 0.33790238836967806,
"grad_norm": 0.490945965051651,
"learning_rate": 8e-05,
"loss": 1.5626,
"step": 1627
},
{
"epoch": 0.33811007268951193,
"grad_norm": 0.5070871710777283,
"learning_rate": 8e-05,
"loss": 1.676,
"step": 1628
},
{
"epoch": 0.3383177570093458,
"grad_norm": 0.49754798412323,
"learning_rate": 8e-05,
"loss": 1.6187,
"step": 1629
},
{
"epoch": 0.33852544132917967,
"grad_norm": 0.500566303730011,
"learning_rate": 8e-05,
"loss": 1.6224,
"step": 1630
},
{
"epoch": 0.3387331256490135,
"grad_norm": 0.5073097944259644,
"learning_rate": 8e-05,
"loss": 1.6215,
"step": 1631
},
{
"epoch": 0.33894080996884735,
"grad_norm": 0.4712946116924286,
"learning_rate": 8e-05,
"loss": 1.5831,
"step": 1632
},
{
"epoch": 0.3391484942886812,
"grad_norm": 0.49603715538978577,
"learning_rate": 8e-05,
"loss": 1.6233,
"step": 1633
},
{
"epoch": 0.3393561786085151,
"grad_norm": 0.5190393328666687,
"learning_rate": 8e-05,
"loss": 1.6539,
"step": 1634
},
{
"epoch": 0.3395638629283489,
"grad_norm": 0.49072492122650146,
"learning_rate": 8e-05,
"loss": 1.51,
"step": 1635
},
{
"epoch": 0.33977154724818276,
"grad_norm": 0.49581220746040344,
"learning_rate": 8e-05,
"loss": 1.5618,
"step": 1636
},
{
"epoch": 0.3399792315680166,
"grad_norm": 0.4976690113544464,
"learning_rate": 8e-05,
"loss": 1.619,
"step": 1637
},
{
"epoch": 0.3401869158878505,
"grad_norm": 0.4935644865036011,
"learning_rate": 8e-05,
"loss": 1.6651,
"step": 1638
},
{
"epoch": 0.3403946002076843,
"grad_norm": 0.5480844974517822,
"learning_rate": 8e-05,
"loss": 1.7232,
"step": 1639
},
{
"epoch": 0.3406022845275182,
"grad_norm": 0.5207673907279968,
"learning_rate": 8e-05,
"loss": 1.646,
"step": 1640
},
{
"epoch": 0.34080996884735204,
"grad_norm": 0.5026153326034546,
"learning_rate": 8e-05,
"loss": 1.5155,
"step": 1641
},
{
"epoch": 0.34101765316718585,
"grad_norm": 0.4776262044906616,
"learning_rate": 8e-05,
"loss": 1.5248,
"step": 1642
},
{
"epoch": 0.3412253374870197,
"grad_norm": 0.5081517100334167,
"learning_rate": 8e-05,
"loss": 1.6524,
"step": 1643
},
{
"epoch": 0.3414330218068536,
"grad_norm": 0.4948354661464691,
"learning_rate": 8e-05,
"loss": 1.6231,
"step": 1644
},
{
"epoch": 0.34164070612668745,
"grad_norm": 0.5102015733718872,
"learning_rate": 8e-05,
"loss": 1.6152,
"step": 1645
},
{
"epoch": 0.34184839044652127,
"grad_norm": 0.5125258564949036,
"learning_rate": 8e-05,
"loss": 1.644,
"step": 1646
},
{
"epoch": 0.34205607476635513,
"grad_norm": 0.49975356459617615,
"learning_rate": 8e-05,
"loss": 1.5621,
"step": 1647
},
{
"epoch": 0.342263759086189,
"grad_norm": 0.5100876092910767,
"learning_rate": 8e-05,
"loss": 1.5948,
"step": 1648
},
{
"epoch": 0.34247144340602287,
"grad_norm": 0.4993584156036377,
"learning_rate": 8e-05,
"loss": 1.5826,
"step": 1649
},
{
"epoch": 0.3426791277258567,
"grad_norm": 0.4974980652332306,
"learning_rate": 8e-05,
"loss": 1.5497,
"step": 1650
},
{
"epoch": 0.34288681204569055,
"grad_norm": 0.5110526084899902,
"learning_rate": 8e-05,
"loss": 1.5656,
"step": 1651
},
{
"epoch": 0.3430944963655244,
"grad_norm": 0.4917437434196472,
"learning_rate": 8e-05,
"loss": 1.5912,
"step": 1652
},
{
"epoch": 0.3433021806853583,
"grad_norm": 0.5046085119247437,
"learning_rate": 8e-05,
"loss": 1.6313,
"step": 1653
},
{
"epoch": 0.3435098650051921,
"grad_norm": 0.5008227229118347,
"learning_rate": 8e-05,
"loss": 1.5603,
"step": 1654
},
{
"epoch": 0.34371754932502596,
"grad_norm": 0.48017212748527527,
"learning_rate": 8e-05,
"loss": 1.5851,
"step": 1655
},
{
"epoch": 0.34392523364485983,
"grad_norm": 0.5130458474159241,
"learning_rate": 8e-05,
"loss": 1.6015,
"step": 1656
},
{
"epoch": 0.34413291796469364,
"grad_norm": 0.5011090636253357,
"learning_rate": 8e-05,
"loss": 1.586,
"step": 1657
},
{
"epoch": 0.3443406022845275,
"grad_norm": 0.4897594451904297,
"learning_rate": 8e-05,
"loss": 1.5021,
"step": 1658
},
{
"epoch": 0.3445482866043614,
"grad_norm": 0.6089925765991211,
"learning_rate": 8e-05,
"loss": 1.6325,
"step": 1659
},
{
"epoch": 0.34475597092419524,
"grad_norm": 0.5230244994163513,
"learning_rate": 8e-05,
"loss": 1.6087,
"step": 1660
},
{
"epoch": 0.34496365524402905,
"grad_norm": 0.5000819563865662,
"learning_rate": 8e-05,
"loss": 1.6082,
"step": 1661
},
{
"epoch": 0.3451713395638629,
"grad_norm": 0.5213269591331482,
"learning_rate": 8e-05,
"loss": 1.645,
"step": 1662
},
{
"epoch": 0.3453790238836968,
"grad_norm": 0.4994923174381256,
"learning_rate": 8e-05,
"loss": 1.5727,
"step": 1663
},
{
"epoch": 0.34558670820353066,
"grad_norm": 0.5164852142333984,
"learning_rate": 8e-05,
"loss": 1.6262,
"step": 1664
},
{
"epoch": 0.34579439252336447,
"grad_norm": 0.5150447487831116,
"learning_rate": 8e-05,
"loss": 1.6247,
"step": 1665
},
{
"epoch": 0.34600207684319834,
"grad_norm": 0.5243363976478577,
"learning_rate": 8e-05,
"loss": 1.6594,
"step": 1666
},
{
"epoch": 0.3462097611630322,
"grad_norm": 0.48954543471336365,
"learning_rate": 8e-05,
"loss": 1.5796,
"step": 1667
},
{
"epoch": 0.34641744548286607,
"grad_norm": 0.5317123532295227,
"learning_rate": 8e-05,
"loss": 1.6389,
"step": 1668
},
{
"epoch": 0.3466251298026999,
"grad_norm": 0.48210567235946655,
"learning_rate": 8e-05,
"loss": 1.611,
"step": 1669
},
{
"epoch": 0.34683281412253375,
"grad_norm": 0.48340731859207153,
"learning_rate": 8e-05,
"loss": 1.5521,
"step": 1670
},
{
"epoch": 0.3470404984423676,
"grad_norm": 0.5049550533294678,
"learning_rate": 8e-05,
"loss": 1.5751,
"step": 1671
},
{
"epoch": 0.34724818276220143,
"grad_norm": 0.5040217638015747,
"learning_rate": 8e-05,
"loss": 1.5263,
"step": 1672
},
{
"epoch": 0.3474558670820353,
"grad_norm": 0.5133804678916931,
"learning_rate": 8e-05,
"loss": 1.651,
"step": 1673
},
{
"epoch": 0.34766355140186916,
"grad_norm": 0.4757126569747925,
"learning_rate": 8e-05,
"loss": 1.5168,
"step": 1674
},
{
"epoch": 0.34787123572170303,
"grad_norm": 0.5413657426834106,
"learning_rate": 8e-05,
"loss": 1.6296,
"step": 1675
},
{
"epoch": 0.34807892004153684,
"grad_norm": 0.5213454961776733,
"learning_rate": 8e-05,
"loss": 1.5978,
"step": 1676
},
{
"epoch": 0.3482866043613707,
"grad_norm": 0.4883634150028229,
"learning_rate": 8e-05,
"loss": 1.6488,
"step": 1677
},
{
"epoch": 0.3484942886812046,
"grad_norm": 0.5572940111160278,
"learning_rate": 8e-05,
"loss": 1.5427,
"step": 1678
},
{
"epoch": 0.34870197300103845,
"grad_norm": 0.5513511300086975,
"learning_rate": 8e-05,
"loss": 1.648,
"step": 1679
},
{
"epoch": 0.34890965732087226,
"grad_norm": 0.5172803401947021,
"learning_rate": 8e-05,
"loss": 1.6542,
"step": 1680
},
{
"epoch": 0.3491173416407061,
"grad_norm": 0.5045535564422607,
"learning_rate": 8e-05,
"loss": 1.5983,
"step": 1681
},
{
"epoch": 0.34932502596054,
"grad_norm": 0.49136364459991455,
"learning_rate": 8e-05,
"loss": 1.5774,
"step": 1682
},
{
"epoch": 0.34953271028037386,
"grad_norm": 0.4870191216468811,
"learning_rate": 8e-05,
"loss": 1.5807,
"step": 1683
},
{
"epoch": 0.34974039460020767,
"grad_norm": 0.524327278137207,
"learning_rate": 8e-05,
"loss": 1.6471,
"step": 1684
},
{
"epoch": 0.34994807892004154,
"grad_norm": 0.5097550749778748,
"learning_rate": 8e-05,
"loss": 1.5905,
"step": 1685
},
{
"epoch": 0.3501557632398754,
"grad_norm": 0.516825795173645,
"learning_rate": 8e-05,
"loss": 1.6539,
"step": 1686
},
{
"epoch": 0.3503634475597092,
"grad_norm": 0.5085751414299011,
"learning_rate": 8e-05,
"loss": 1.6649,
"step": 1687
},
{
"epoch": 0.3505711318795431,
"grad_norm": 0.49749553203582764,
"learning_rate": 8e-05,
"loss": 1.5978,
"step": 1688
},
{
"epoch": 0.35077881619937695,
"grad_norm": 0.49809882044792175,
"learning_rate": 8e-05,
"loss": 1.6228,
"step": 1689
},
{
"epoch": 0.3509865005192108,
"grad_norm": 0.48347166180610657,
"learning_rate": 8e-05,
"loss": 1.5463,
"step": 1690
},
{
"epoch": 0.35119418483904463,
"grad_norm": 0.4847400486469269,
"learning_rate": 8e-05,
"loss": 1.5379,
"step": 1691
},
{
"epoch": 0.3514018691588785,
"grad_norm": 0.48331671953201294,
"learning_rate": 8e-05,
"loss": 1.5704,
"step": 1692
},
{
"epoch": 0.35160955347871237,
"grad_norm": 0.49434080719947815,
"learning_rate": 8e-05,
"loss": 1.6055,
"step": 1693
},
{
"epoch": 0.35181723779854623,
"grad_norm": 0.5171917676925659,
"learning_rate": 8e-05,
"loss": 1.6155,
"step": 1694
},
{
"epoch": 0.35202492211838005,
"grad_norm": 0.4984574317932129,
"learning_rate": 8e-05,
"loss": 1.5811,
"step": 1695
},
{
"epoch": 0.3522326064382139,
"grad_norm": 0.4882890284061432,
"learning_rate": 8e-05,
"loss": 1.5948,
"step": 1696
},
{
"epoch": 0.3524402907580478,
"grad_norm": 0.48760464787483215,
"learning_rate": 8e-05,
"loss": 1.5775,
"step": 1697
},
{
"epoch": 0.3526479750778816,
"grad_norm": 0.48519834876060486,
"learning_rate": 8e-05,
"loss": 1.5734,
"step": 1698
},
{
"epoch": 0.35285565939771546,
"grad_norm": 0.518786609172821,
"learning_rate": 8e-05,
"loss": 1.5969,
"step": 1699
},
{
"epoch": 0.3530633437175493,
"grad_norm": 0.4933086335659027,
"learning_rate": 8e-05,
"loss": 1.5626,
"step": 1700
},
{
"epoch": 0.3532710280373832,
"grad_norm": 0.5093357563018799,
"learning_rate": 8e-05,
"loss": 1.5513,
"step": 1701
},
{
"epoch": 0.353478712357217,
"grad_norm": 0.5171862244606018,
"learning_rate": 8e-05,
"loss": 1.5887,
"step": 1702
},
{
"epoch": 0.3536863966770509,
"grad_norm": 0.5157456994056702,
"learning_rate": 8e-05,
"loss": 1.6256,
"step": 1703
},
{
"epoch": 0.35389408099688474,
"grad_norm": 0.5482586622238159,
"learning_rate": 8e-05,
"loss": 1.6649,
"step": 1704
},
{
"epoch": 0.3541017653167186,
"grad_norm": 0.5034733414649963,
"learning_rate": 8e-05,
"loss": 1.617,
"step": 1705
},
{
"epoch": 0.3543094496365524,
"grad_norm": 0.5075942277908325,
"learning_rate": 8e-05,
"loss": 1.6087,
"step": 1706
},
{
"epoch": 0.3545171339563863,
"grad_norm": 0.5159339904785156,
"learning_rate": 8e-05,
"loss": 1.5778,
"step": 1707
},
{
"epoch": 0.35472481827622016,
"grad_norm": 0.5066738724708557,
"learning_rate": 8e-05,
"loss": 1.6548,
"step": 1708
},
{
"epoch": 0.354932502596054,
"grad_norm": 0.48170992732048035,
"learning_rate": 8e-05,
"loss": 1.6448,
"step": 1709
},
{
"epoch": 0.35514018691588783,
"grad_norm": 0.47656309604644775,
"learning_rate": 8e-05,
"loss": 1.5139,
"step": 1710
},
{
"epoch": 0.3553478712357217,
"grad_norm": 0.5038301944732666,
"learning_rate": 8e-05,
"loss": 1.5363,
"step": 1711
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.5651991963386536,
"learning_rate": 8e-05,
"loss": 1.6422,
"step": 1712
},
{
"epoch": 0.3557632398753894,
"grad_norm": 0.49263623356819153,
"learning_rate": 8e-05,
"loss": 1.5621,
"step": 1713
},
{
"epoch": 0.35597092419522325,
"grad_norm": 0.5081530809402466,
"learning_rate": 8e-05,
"loss": 1.5904,
"step": 1714
},
{
"epoch": 0.3561786085150571,
"grad_norm": 0.5229151844978333,
"learning_rate": 8e-05,
"loss": 1.6046,
"step": 1715
},
{
"epoch": 0.356386292834891,
"grad_norm": 0.4824669361114502,
"learning_rate": 8e-05,
"loss": 1.5738,
"step": 1716
},
{
"epoch": 0.3565939771547248,
"grad_norm": 0.5122133493423462,
"learning_rate": 8e-05,
"loss": 1.635,
"step": 1717
},
{
"epoch": 0.35680166147455866,
"grad_norm": 0.4866729974746704,
"learning_rate": 8e-05,
"loss": 1.5987,
"step": 1718
},
{
"epoch": 0.35700934579439253,
"grad_norm": 0.5040173530578613,
"learning_rate": 8e-05,
"loss": 1.5918,
"step": 1719
},
{
"epoch": 0.3572170301142264,
"grad_norm": 0.496620774269104,
"learning_rate": 8e-05,
"loss": 1.5897,
"step": 1720
},
{
"epoch": 0.3574247144340602,
"grad_norm": 0.521233081817627,
"learning_rate": 8e-05,
"loss": 1.6024,
"step": 1721
},
{
"epoch": 0.3576323987538941,
"grad_norm": 0.4817935526371002,
"learning_rate": 8e-05,
"loss": 1.5798,
"step": 1722
},
{
"epoch": 0.35784008307372794,
"grad_norm": 0.5107322335243225,
"learning_rate": 8e-05,
"loss": 1.6237,
"step": 1723
},
{
"epoch": 0.3580477673935618,
"grad_norm": 0.49109092354774475,
"learning_rate": 8e-05,
"loss": 1.5852,
"step": 1724
},
{
"epoch": 0.3582554517133956,
"grad_norm": 0.49932169914245605,
"learning_rate": 8e-05,
"loss": 1.5684,
"step": 1725
},
{
"epoch": 0.3584631360332295,
"grad_norm": 0.4933273494243622,
"learning_rate": 8e-05,
"loss": 1.5987,
"step": 1726
},
{
"epoch": 0.35867082035306336,
"grad_norm": 0.5152563452720642,
"learning_rate": 8e-05,
"loss": 1.6229,
"step": 1727
},
{
"epoch": 0.35887850467289717,
"grad_norm": 0.49047115445137024,
"learning_rate": 8e-05,
"loss": 1.5985,
"step": 1728
},
{
"epoch": 0.35908618899273104,
"grad_norm": 0.5049684643745422,
"learning_rate": 8e-05,
"loss": 1.5635,
"step": 1729
},
{
"epoch": 0.3592938733125649,
"grad_norm": 0.4843440055847168,
"learning_rate": 8e-05,
"loss": 1.6021,
"step": 1730
},
{
"epoch": 0.35950155763239877,
"grad_norm": 0.48820793628692627,
"learning_rate": 8e-05,
"loss": 1.5598,
"step": 1731
},
{
"epoch": 0.3597092419522326,
"grad_norm": 0.5228747725486755,
"learning_rate": 8e-05,
"loss": 1.5545,
"step": 1732
},
{
"epoch": 0.35991692627206645,
"grad_norm": 0.4973585903644562,
"learning_rate": 8e-05,
"loss": 1.5758,
"step": 1733
},
{
"epoch": 0.3601246105919003,
"grad_norm": 0.5158231258392334,
"learning_rate": 8e-05,
"loss": 1.6215,
"step": 1734
},
{
"epoch": 0.3603322949117342,
"grad_norm": 0.4845174551010132,
"learning_rate": 8e-05,
"loss": 1.5932,
"step": 1735
},
{
"epoch": 0.360539979231568,
"grad_norm": 0.505311906337738,
"learning_rate": 8e-05,
"loss": 1.6716,
"step": 1736
},
{
"epoch": 0.36074766355140186,
"grad_norm": 0.5100334286689758,
"learning_rate": 8e-05,
"loss": 1.6133,
"step": 1737
},
{
"epoch": 0.36095534787123573,
"grad_norm": 0.510661244392395,
"learning_rate": 8e-05,
"loss": 1.6705,
"step": 1738
},
{
"epoch": 0.3611630321910696,
"grad_norm": 0.49372726678848267,
"learning_rate": 8e-05,
"loss": 1.6432,
"step": 1739
},
{
"epoch": 0.3613707165109034,
"grad_norm": 0.4995138645172119,
"learning_rate": 8e-05,
"loss": 1.5815,
"step": 1740
},
{
"epoch": 0.3615784008307373,
"grad_norm": 0.518836259841919,
"learning_rate": 8e-05,
"loss": 1.5321,
"step": 1741
},
{
"epoch": 0.36178608515057115,
"grad_norm": 0.5100981593132019,
"learning_rate": 8e-05,
"loss": 1.573,
"step": 1742
},
{
"epoch": 0.36199376947040496,
"grad_norm": 0.4908148944377899,
"learning_rate": 8e-05,
"loss": 1.6022,
"step": 1743
},
{
"epoch": 0.3622014537902388,
"grad_norm": 0.5129155516624451,
"learning_rate": 8e-05,
"loss": 1.5825,
"step": 1744
},
{
"epoch": 0.3624091381100727,
"grad_norm": 0.4997279942035675,
"learning_rate": 8e-05,
"loss": 1.672,
"step": 1745
},
{
"epoch": 0.36261682242990656,
"grad_norm": 0.4920547902584076,
"learning_rate": 8e-05,
"loss": 1.5973,
"step": 1746
},
{
"epoch": 0.36282450674974037,
"grad_norm": 0.5037687420845032,
"learning_rate": 8e-05,
"loss": 1.4707,
"step": 1747
},
{
"epoch": 0.36303219106957424,
"grad_norm": 0.5036182999610901,
"learning_rate": 8e-05,
"loss": 1.6161,
"step": 1748
},
{
"epoch": 0.3632398753894081,
"grad_norm": 0.5063603520393372,
"learning_rate": 8e-05,
"loss": 1.5124,
"step": 1749
},
{
"epoch": 0.363447559709242,
"grad_norm": 0.5060688257217407,
"learning_rate": 8e-05,
"loss": 1.553,
"step": 1750
},
{
"epoch": 0.3636552440290758,
"grad_norm": 0.4944371283054352,
"learning_rate": 8e-05,
"loss": 1.572,
"step": 1751
},
{
"epoch": 0.36386292834890965,
"grad_norm": 0.5195178985595703,
"learning_rate": 8e-05,
"loss": 1.5257,
"step": 1752
},
{
"epoch": 0.3640706126687435,
"grad_norm": 0.4922371208667755,
"learning_rate": 8e-05,
"loss": 1.5622,
"step": 1753
},
{
"epoch": 0.3642782969885774,
"grad_norm": 0.5210584998130798,
"learning_rate": 8e-05,
"loss": 1.6212,
"step": 1754
},
{
"epoch": 0.3644859813084112,
"grad_norm": 0.5490279197692871,
"learning_rate": 8e-05,
"loss": 1.6151,
"step": 1755
},
{
"epoch": 0.36469366562824507,
"grad_norm": 0.4872822165489197,
"learning_rate": 8e-05,
"loss": 1.5555,
"step": 1756
},
{
"epoch": 0.36490134994807893,
"grad_norm": 0.5005167126655579,
"learning_rate": 8e-05,
"loss": 1.5788,
"step": 1757
},
{
"epoch": 0.36510903426791275,
"grad_norm": 0.4953737258911133,
"learning_rate": 8e-05,
"loss": 1.6162,
"step": 1758
},
{
"epoch": 0.3653167185877466,
"grad_norm": 0.5173213481903076,
"learning_rate": 8e-05,
"loss": 1.569,
"step": 1759
},
{
"epoch": 0.3655244029075805,
"grad_norm": 0.5120421648025513,
"learning_rate": 8e-05,
"loss": 1.6336,
"step": 1760
},
{
"epoch": 0.36573208722741435,
"grad_norm": 0.5063916444778442,
"learning_rate": 8e-05,
"loss": 1.5695,
"step": 1761
},
{
"epoch": 0.36593977154724816,
"grad_norm": 0.5345103144645691,
"learning_rate": 8e-05,
"loss": 1.5974,
"step": 1762
},
{
"epoch": 0.36614745586708203,
"grad_norm": 0.47018736600875854,
"learning_rate": 8e-05,
"loss": 1.507,
"step": 1763
},
{
"epoch": 0.3663551401869159,
"grad_norm": 0.5206469297409058,
"learning_rate": 8e-05,
"loss": 1.5425,
"step": 1764
},
{
"epoch": 0.36656282450674976,
"grad_norm": 0.49119699001312256,
"learning_rate": 8e-05,
"loss": 1.5484,
"step": 1765
},
{
"epoch": 0.3667705088265836,
"grad_norm": 0.48123517632484436,
"learning_rate": 8e-05,
"loss": 1.5174,
"step": 1766
},
{
"epoch": 0.36697819314641744,
"grad_norm": 0.5121713280677795,
"learning_rate": 8e-05,
"loss": 1.6451,
"step": 1767
},
{
"epoch": 0.3671858774662513,
"grad_norm": 0.5062276124954224,
"learning_rate": 8e-05,
"loss": 1.5787,
"step": 1768
},
{
"epoch": 0.3673935617860852,
"grad_norm": 0.4850326180458069,
"learning_rate": 8e-05,
"loss": 1.605,
"step": 1769
},
{
"epoch": 0.367601246105919,
"grad_norm": 0.5524799823760986,
"learning_rate": 8e-05,
"loss": 1.6658,
"step": 1770
},
{
"epoch": 0.36780893042575286,
"grad_norm": 0.5042113661766052,
"learning_rate": 8e-05,
"loss": 1.5878,
"step": 1771
},
{
"epoch": 0.3680166147455867,
"grad_norm": 0.5291147828102112,
"learning_rate": 8e-05,
"loss": 1.6076,
"step": 1772
},
{
"epoch": 0.36822429906542054,
"grad_norm": 0.5282319188117981,
"learning_rate": 8e-05,
"loss": 1.6024,
"step": 1773
},
{
"epoch": 0.3684319833852544,
"grad_norm": 0.5120516419410706,
"learning_rate": 8e-05,
"loss": 1.5431,
"step": 1774
},
{
"epoch": 0.36863966770508827,
"grad_norm": 0.5158196687698364,
"learning_rate": 8e-05,
"loss": 1.6004,
"step": 1775
},
{
"epoch": 0.36884735202492214,
"grad_norm": 0.50716632604599,
"learning_rate": 8e-05,
"loss": 1.6615,
"step": 1776
},
{
"epoch": 0.36905503634475595,
"grad_norm": 0.5380442142486572,
"learning_rate": 8e-05,
"loss": 1.5219,
"step": 1777
},
{
"epoch": 0.3692627206645898,
"grad_norm": 0.4817923903465271,
"learning_rate": 8e-05,
"loss": 1.5422,
"step": 1778
},
{
"epoch": 0.3694704049844237,
"grad_norm": 0.509099006652832,
"learning_rate": 8e-05,
"loss": 1.5917,
"step": 1779
},
{
"epoch": 0.36967808930425755,
"grad_norm": 0.5473061800003052,
"learning_rate": 8e-05,
"loss": 1.6635,
"step": 1780
},
{
"epoch": 0.36988577362409136,
"grad_norm": 0.5307387709617615,
"learning_rate": 8e-05,
"loss": 1.6171,
"step": 1781
},
{
"epoch": 0.37009345794392523,
"grad_norm": 0.5154988169670105,
"learning_rate": 8e-05,
"loss": 1.5596,
"step": 1782
},
{
"epoch": 0.3703011422637591,
"grad_norm": 0.4852389097213745,
"learning_rate": 8e-05,
"loss": 1.5276,
"step": 1783
},
{
"epoch": 0.37050882658359297,
"grad_norm": 0.4957145154476166,
"learning_rate": 8e-05,
"loss": 1.5823,
"step": 1784
},
{
"epoch": 0.3707165109034268,
"grad_norm": 0.5096319317817688,
"learning_rate": 8e-05,
"loss": 1.5288,
"step": 1785
},
{
"epoch": 0.37092419522326064,
"grad_norm": 0.5101845860481262,
"learning_rate": 8e-05,
"loss": 1.6088,
"step": 1786
},
{
"epoch": 0.3711318795430945,
"grad_norm": 0.5254088640213013,
"learning_rate": 8e-05,
"loss": 1.6396,
"step": 1787
},
{
"epoch": 0.3713395638629283,
"grad_norm": 0.5071054697036743,
"learning_rate": 8e-05,
"loss": 1.569,
"step": 1788
},
{
"epoch": 0.3715472481827622,
"grad_norm": 0.51131272315979,
"learning_rate": 8e-05,
"loss": 1.6019,
"step": 1789
},
{
"epoch": 0.37175493250259606,
"grad_norm": 0.5088433623313904,
"learning_rate": 8e-05,
"loss": 1.5169,
"step": 1790
},
{
"epoch": 0.3719626168224299,
"grad_norm": 0.5493926405906677,
"learning_rate": 8e-05,
"loss": 1.595,
"step": 1791
},
{
"epoch": 0.37217030114226374,
"grad_norm": 0.4982267916202545,
"learning_rate": 8e-05,
"loss": 1.5716,
"step": 1792
},
{
"epoch": 0.3723779854620976,
"grad_norm": 0.5942201614379883,
"learning_rate": 8e-05,
"loss": 1.6425,
"step": 1793
},
{
"epoch": 0.37258566978193147,
"grad_norm": 0.4930921196937561,
"learning_rate": 8e-05,
"loss": 1.5942,
"step": 1794
},
{
"epoch": 0.37279335410176534,
"grad_norm": 0.5201108455657959,
"learning_rate": 8e-05,
"loss": 1.6025,
"step": 1795
},
{
"epoch": 0.37300103842159915,
"grad_norm": 0.5413525104522705,
"learning_rate": 8e-05,
"loss": 1.6227,
"step": 1796
},
{
"epoch": 0.373208722741433,
"grad_norm": 0.4801349937915802,
"learning_rate": 8e-05,
"loss": 1.5519,
"step": 1797
},
{
"epoch": 0.3734164070612669,
"grad_norm": 0.545444905757904,
"learning_rate": 8e-05,
"loss": 1.5614,
"step": 1798
},
{
"epoch": 0.37362409138110075,
"grad_norm": 0.5094887614250183,
"learning_rate": 8e-05,
"loss": 1.5961,
"step": 1799
},
{
"epoch": 0.37383177570093457,
"grad_norm": 0.5024911761283875,
"learning_rate": 8e-05,
"loss": 1.6617,
"step": 1800
},
{
"epoch": 0.37403946002076843,
"grad_norm": 0.4914880692958832,
"learning_rate": 8e-05,
"loss": 1.547,
"step": 1801
},
{
"epoch": 0.3742471443406023,
"grad_norm": 0.488719642162323,
"learning_rate": 8e-05,
"loss": 1.5906,
"step": 1802
},
{
"epoch": 0.3744548286604361,
"grad_norm": 0.5118622779846191,
"learning_rate": 8e-05,
"loss": 1.5741,
"step": 1803
},
{
"epoch": 0.37466251298027,
"grad_norm": 0.5298313498497009,
"learning_rate": 8e-05,
"loss": 1.6255,
"step": 1804
},
{
"epoch": 0.37487019730010385,
"grad_norm": 0.46859586238861084,
"learning_rate": 8e-05,
"loss": 1.5109,
"step": 1805
},
{
"epoch": 0.3750778816199377,
"grad_norm": 0.5379807949066162,
"learning_rate": 8e-05,
"loss": 1.6255,
"step": 1806
},
{
"epoch": 0.3752855659397715,
"grad_norm": 0.4950341284275055,
"learning_rate": 8e-05,
"loss": 1.659,
"step": 1807
},
{
"epoch": 0.3754932502596054,
"grad_norm": 0.49582797288894653,
"learning_rate": 8e-05,
"loss": 1.6734,
"step": 1808
},
{
"epoch": 0.37570093457943926,
"grad_norm": 0.522197961807251,
"learning_rate": 8e-05,
"loss": 1.5411,
"step": 1809
},
{
"epoch": 0.37590861889927313,
"grad_norm": 0.49645760655403137,
"learning_rate": 8e-05,
"loss": 1.5901,
"step": 1810
},
{
"epoch": 0.37611630321910694,
"grad_norm": 0.5847791433334351,
"learning_rate": 8e-05,
"loss": 1.5925,
"step": 1811
},
{
"epoch": 0.3763239875389408,
"grad_norm": 0.4936416447162628,
"learning_rate": 8e-05,
"loss": 1.6004,
"step": 1812
},
{
"epoch": 0.3765316718587747,
"grad_norm": 0.5401305556297302,
"learning_rate": 8e-05,
"loss": 1.6375,
"step": 1813
},
{
"epoch": 0.37673935617860854,
"grad_norm": 0.5169132351875305,
"learning_rate": 8e-05,
"loss": 1.6277,
"step": 1814
},
{
"epoch": 0.37694704049844235,
"grad_norm": 0.5267390012741089,
"learning_rate": 8e-05,
"loss": 1.595,
"step": 1815
},
{
"epoch": 0.3771547248182762,
"grad_norm": 0.5062999129295349,
"learning_rate": 8e-05,
"loss": 1.6584,
"step": 1816
},
{
"epoch": 0.3773624091381101,
"grad_norm": 0.4729461967945099,
"learning_rate": 8e-05,
"loss": 1.5783,
"step": 1817
},
{
"epoch": 0.3775700934579439,
"grad_norm": 0.4871976375579834,
"learning_rate": 8e-05,
"loss": 1.6477,
"step": 1818
},
{
"epoch": 0.37777777777777777,
"grad_norm": 0.4819161593914032,
"learning_rate": 8e-05,
"loss": 1.5632,
"step": 1819
},
{
"epoch": 0.37798546209761164,
"grad_norm": 0.48835310339927673,
"learning_rate": 8e-05,
"loss": 1.6809,
"step": 1820
},
{
"epoch": 0.3781931464174455,
"grad_norm": 0.5206708908081055,
"learning_rate": 8e-05,
"loss": 1.6919,
"step": 1821
},
{
"epoch": 0.3784008307372793,
"grad_norm": 0.49859198927879333,
"learning_rate": 8e-05,
"loss": 1.624,
"step": 1822
},
{
"epoch": 0.3786085150571132,
"grad_norm": 0.47674262523651123,
"learning_rate": 8e-05,
"loss": 1.4634,
"step": 1823
},
{
"epoch": 0.37881619937694705,
"grad_norm": 0.5091409683227539,
"learning_rate": 8e-05,
"loss": 1.5764,
"step": 1824
},
{
"epoch": 0.3790238836967809,
"grad_norm": 0.4878379702568054,
"learning_rate": 8e-05,
"loss": 1.6147,
"step": 1825
},
{
"epoch": 0.37923156801661473,
"grad_norm": 0.4939122200012207,
"learning_rate": 8e-05,
"loss": 1.5762,
"step": 1826
},
{
"epoch": 0.3794392523364486,
"grad_norm": 0.5266447067260742,
"learning_rate": 8e-05,
"loss": 1.6838,
"step": 1827
},
{
"epoch": 0.37964693665628246,
"grad_norm": 0.49246594309806824,
"learning_rate": 8e-05,
"loss": 1.561,
"step": 1828
},
{
"epoch": 0.37985462097611633,
"grad_norm": 0.4985910952091217,
"learning_rate": 8e-05,
"loss": 1.4934,
"step": 1829
},
{
"epoch": 0.38006230529595014,
"grad_norm": 0.49930235743522644,
"learning_rate": 8e-05,
"loss": 1.5857,
"step": 1830
},
{
"epoch": 0.380269989615784,
"grad_norm": 0.5117544531822205,
"learning_rate": 8e-05,
"loss": 1.5451,
"step": 1831
},
{
"epoch": 0.3804776739356179,
"grad_norm": 0.5053813457489014,
"learning_rate": 8e-05,
"loss": 1.5836,
"step": 1832
},
{
"epoch": 0.3806853582554517,
"grad_norm": 0.5296281576156616,
"learning_rate": 8e-05,
"loss": 1.5972,
"step": 1833
},
{
"epoch": 0.38089304257528556,
"grad_norm": 0.5012361407279968,
"learning_rate": 8e-05,
"loss": 1.5809,
"step": 1834
},
{
"epoch": 0.3811007268951194,
"grad_norm": 0.5064777135848999,
"learning_rate": 8e-05,
"loss": 1.5584,
"step": 1835
},
{
"epoch": 0.3813084112149533,
"grad_norm": 0.5177083611488342,
"learning_rate": 8e-05,
"loss": 1.5851,
"step": 1836
},
{
"epoch": 0.3815160955347871,
"grad_norm": 0.5089267492294312,
"learning_rate": 8e-05,
"loss": 1.5429,
"step": 1837
},
{
"epoch": 0.38172377985462097,
"grad_norm": 0.49458834528923035,
"learning_rate": 8e-05,
"loss": 1.6052,
"step": 1838
},
{
"epoch": 0.38193146417445484,
"grad_norm": 0.4878547489643097,
"learning_rate": 8e-05,
"loss": 1.5777,
"step": 1839
},
{
"epoch": 0.3821391484942887,
"grad_norm": 0.5592621564865112,
"learning_rate": 8e-05,
"loss": 1.6285,
"step": 1840
},
{
"epoch": 0.3823468328141225,
"grad_norm": 0.5030962824821472,
"learning_rate": 8e-05,
"loss": 1.5928,
"step": 1841
},
{
"epoch": 0.3825545171339564,
"grad_norm": 0.5183880925178528,
"learning_rate": 8e-05,
"loss": 1.6346,
"step": 1842
},
{
"epoch": 0.38276220145379025,
"grad_norm": 0.4975292980670929,
"learning_rate": 8e-05,
"loss": 1.5236,
"step": 1843
},
{
"epoch": 0.3829698857736241,
"grad_norm": 0.5079745650291443,
"learning_rate": 8e-05,
"loss": 1.5622,
"step": 1844
},
{
"epoch": 0.38317757009345793,
"grad_norm": 0.49533551931381226,
"learning_rate": 8e-05,
"loss": 1.563,
"step": 1845
},
{
"epoch": 0.3833852544132918,
"grad_norm": 0.5062286257743835,
"learning_rate": 8e-05,
"loss": 1.6209,
"step": 1846
},
{
"epoch": 0.38359293873312567,
"grad_norm": 0.49643903970718384,
"learning_rate": 8e-05,
"loss": 1.6075,
"step": 1847
},
{
"epoch": 0.3838006230529595,
"grad_norm": 0.6108715534210205,
"learning_rate": 8e-05,
"loss": 1.6111,
"step": 1848
},
{
"epoch": 0.38400830737279334,
"grad_norm": 0.5051819682121277,
"learning_rate": 8e-05,
"loss": 1.5039,
"step": 1849
},
{
"epoch": 0.3842159916926272,
"grad_norm": 0.5083017945289612,
"learning_rate": 8e-05,
"loss": 1.5674,
"step": 1850
},
{
"epoch": 0.3844236760124611,
"grad_norm": 0.5128228068351746,
"learning_rate": 8e-05,
"loss": 1.6137,
"step": 1851
},
{
"epoch": 0.3846313603322949,
"grad_norm": 0.5230603218078613,
"learning_rate": 8e-05,
"loss": 1.6066,
"step": 1852
},
{
"epoch": 0.38483904465212876,
"grad_norm": 0.527700662612915,
"learning_rate": 8e-05,
"loss": 1.6253,
"step": 1853
},
{
"epoch": 0.3850467289719626,
"grad_norm": 0.567374050617218,
"learning_rate": 8e-05,
"loss": 1.5531,
"step": 1854
},
{
"epoch": 0.3852544132917965,
"grad_norm": 0.4868682622909546,
"learning_rate": 8e-05,
"loss": 1.6318,
"step": 1855
},
{
"epoch": 0.3854620976116303,
"grad_norm": 0.5227965116500854,
"learning_rate": 8e-05,
"loss": 1.5589,
"step": 1856
},
{
"epoch": 0.3856697819314642,
"grad_norm": 0.5066382884979248,
"learning_rate": 8e-05,
"loss": 1.6554,
"step": 1857
},
{
"epoch": 0.38587746625129804,
"grad_norm": 0.49491891264915466,
"learning_rate": 8e-05,
"loss": 1.6015,
"step": 1858
},
{
"epoch": 0.38608515057113185,
"grad_norm": 0.5157635807991028,
"learning_rate": 8e-05,
"loss": 1.6216,
"step": 1859
},
{
"epoch": 0.3862928348909657,
"grad_norm": 0.5314090847969055,
"learning_rate": 8e-05,
"loss": 1.5958,
"step": 1860
},
{
"epoch": 0.3865005192107996,
"grad_norm": 0.48347991704940796,
"learning_rate": 8e-05,
"loss": 1.5142,
"step": 1861
},
{
"epoch": 0.38670820353063345,
"grad_norm": 0.5229157209396362,
"learning_rate": 8e-05,
"loss": 1.5493,
"step": 1862
},
{
"epoch": 0.38691588785046727,
"grad_norm": 0.5444154739379883,
"learning_rate": 8e-05,
"loss": 1.6436,
"step": 1863
},
{
"epoch": 0.38712357217030113,
"grad_norm": 0.5127759575843811,
"learning_rate": 8e-05,
"loss": 1.6032,
"step": 1864
},
{
"epoch": 0.387331256490135,
"grad_norm": 0.4927530884742737,
"learning_rate": 8e-05,
"loss": 1.6051,
"step": 1865
},
{
"epoch": 0.38753894080996887,
"grad_norm": 0.5002699494361877,
"learning_rate": 8e-05,
"loss": 1.6294,
"step": 1866
},
{
"epoch": 0.3877466251298027,
"grad_norm": 0.5165937542915344,
"learning_rate": 8e-05,
"loss": 1.5966,
"step": 1867
},
{
"epoch": 0.38795430944963655,
"grad_norm": 0.5008817315101624,
"learning_rate": 8e-05,
"loss": 1.5651,
"step": 1868
},
{
"epoch": 0.3881619937694704,
"grad_norm": 0.49471011757850647,
"learning_rate": 8e-05,
"loss": 1.5699,
"step": 1869
},
{
"epoch": 0.3883696780893043,
"grad_norm": 0.5040736198425293,
"learning_rate": 8e-05,
"loss": 1.6638,
"step": 1870
},
{
"epoch": 0.3885773624091381,
"grad_norm": 0.4965846538543701,
"learning_rate": 8e-05,
"loss": 1.5604,
"step": 1871
},
{
"epoch": 0.38878504672897196,
"grad_norm": 0.48028695583343506,
"learning_rate": 8e-05,
"loss": 1.5656,
"step": 1872
},
{
"epoch": 0.38899273104880583,
"grad_norm": 0.506149411201477,
"learning_rate": 8e-05,
"loss": 1.5405,
"step": 1873
},
{
"epoch": 0.38920041536863964,
"grad_norm": 0.4821113049983978,
"learning_rate": 8e-05,
"loss": 1.6343,
"step": 1874
},
{
"epoch": 0.3894080996884735,
"grad_norm": 0.545812726020813,
"learning_rate": 8e-05,
"loss": 1.6731,
"step": 1875
},
{
"epoch": 0.3896157840083074,
"grad_norm": 0.5006311535835266,
"learning_rate": 8e-05,
"loss": 1.5129,
"step": 1876
},
{
"epoch": 0.38982346832814124,
"grad_norm": 0.48582524061203003,
"learning_rate": 8e-05,
"loss": 1.5433,
"step": 1877
},
{
"epoch": 0.39003115264797505,
"grad_norm": 0.5167272090911865,
"learning_rate": 8e-05,
"loss": 1.6357,
"step": 1878
},
{
"epoch": 0.3902388369678089,
"grad_norm": 0.5142748951911926,
"learning_rate": 8e-05,
"loss": 1.6098,
"step": 1879
},
{
"epoch": 0.3904465212876428,
"grad_norm": 0.5022456645965576,
"learning_rate": 8e-05,
"loss": 1.5784,
"step": 1880
},
{
"epoch": 0.39065420560747666,
"grad_norm": 0.5111859440803528,
"learning_rate": 8e-05,
"loss": 1.5819,
"step": 1881
},
{
"epoch": 0.39086188992731047,
"grad_norm": 0.49327921867370605,
"learning_rate": 8e-05,
"loss": 1.5932,
"step": 1882
},
{
"epoch": 0.39106957424714434,
"grad_norm": 0.5264003872871399,
"learning_rate": 8e-05,
"loss": 1.615,
"step": 1883
},
{
"epoch": 0.3912772585669782,
"grad_norm": 0.4997003376483917,
"learning_rate": 8e-05,
"loss": 1.5645,
"step": 1884
},
{
"epoch": 0.39148494288681207,
"grad_norm": 0.48707181215286255,
"learning_rate": 8e-05,
"loss": 1.566,
"step": 1885
},
{
"epoch": 0.3916926272066459,
"grad_norm": 0.5190602540969849,
"learning_rate": 8e-05,
"loss": 1.5918,
"step": 1886
},
{
"epoch": 0.39190031152647975,
"grad_norm": 0.5049939155578613,
"learning_rate": 8e-05,
"loss": 1.5857,
"step": 1887
},
{
"epoch": 0.3921079958463136,
"grad_norm": 0.5178766250610352,
"learning_rate": 8e-05,
"loss": 1.6534,
"step": 1888
},
{
"epoch": 0.39231568016614743,
"grad_norm": 0.510301411151886,
"learning_rate": 8e-05,
"loss": 1.6123,
"step": 1889
},
{
"epoch": 0.3925233644859813,
"grad_norm": 0.48783424496650696,
"learning_rate": 8e-05,
"loss": 1.51,
"step": 1890
},
{
"epoch": 0.39273104880581516,
"grad_norm": 0.5162112712860107,
"learning_rate": 8e-05,
"loss": 1.5571,
"step": 1891
},
{
"epoch": 0.39293873312564903,
"grad_norm": 0.5092341303825378,
"learning_rate": 8e-05,
"loss": 1.6414,
"step": 1892
},
{
"epoch": 0.39314641744548284,
"grad_norm": 0.5020235776901245,
"learning_rate": 8e-05,
"loss": 1.626,
"step": 1893
},
{
"epoch": 0.3933541017653167,
"grad_norm": 0.5041497945785522,
"learning_rate": 8e-05,
"loss": 1.5922,
"step": 1894
},
{
"epoch": 0.3935617860851506,
"grad_norm": 0.5252364277839661,
"learning_rate": 8e-05,
"loss": 1.6367,
"step": 1895
},
{
"epoch": 0.39376947040498445,
"grad_norm": 0.5267825126647949,
"learning_rate": 8e-05,
"loss": 1.6483,
"step": 1896
},
{
"epoch": 0.39397715472481826,
"grad_norm": 0.47401174902915955,
"learning_rate": 8e-05,
"loss": 1.4967,
"step": 1897
},
{
"epoch": 0.3941848390446521,
"grad_norm": 0.47972801327705383,
"learning_rate": 8e-05,
"loss": 1.5402,
"step": 1898
},
{
"epoch": 0.394392523364486,
"grad_norm": 0.5107938051223755,
"learning_rate": 8e-05,
"loss": 1.6526,
"step": 1899
},
{
"epoch": 0.39460020768431986,
"grad_norm": 0.4895242154598236,
"learning_rate": 8e-05,
"loss": 1.5815,
"step": 1900
},
{
"epoch": 0.39480789200415367,
"grad_norm": 0.5084724426269531,
"learning_rate": 8e-05,
"loss": 1.5669,
"step": 1901
},
{
"epoch": 0.39501557632398754,
"grad_norm": 0.4843336045742035,
"learning_rate": 8e-05,
"loss": 1.5556,
"step": 1902
},
{
"epoch": 0.3952232606438214,
"grad_norm": 0.5088340044021606,
"learning_rate": 8e-05,
"loss": 1.5571,
"step": 1903
},
{
"epoch": 0.3954309449636552,
"grad_norm": 0.5239298939704895,
"learning_rate": 8e-05,
"loss": 1.6224,
"step": 1904
},
{
"epoch": 0.3956386292834891,
"grad_norm": 0.49526721239089966,
"learning_rate": 8e-05,
"loss": 1.6317,
"step": 1905
},
{
"epoch": 0.39584631360332295,
"grad_norm": 0.5118904113769531,
"learning_rate": 8e-05,
"loss": 1.6292,
"step": 1906
},
{
"epoch": 0.3960539979231568,
"grad_norm": 0.5064014196395874,
"learning_rate": 8e-05,
"loss": 1.6595,
"step": 1907
},
{
"epoch": 0.39626168224299063,
"grad_norm": 0.5004372000694275,
"learning_rate": 8e-05,
"loss": 1.5685,
"step": 1908
},
{
"epoch": 0.3964693665628245,
"grad_norm": 0.5007834434509277,
"learning_rate": 8e-05,
"loss": 1.5655,
"step": 1909
},
{
"epoch": 0.39667705088265837,
"grad_norm": 0.4961642920970917,
"learning_rate": 8e-05,
"loss": 1.5386,
"step": 1910
},
{
"epoch": 0.39688473520249223,
"grad_norm": 0.49714717268943787,
"learning_rate": 8e-05,
"loss": 1.5585,
"step": 1911
},
{
"epoch": 0.39709241952232605,
"grad_norm": 0.5007422566413879,
"learning_rate": 8e-05,
"loss": 1.5251,
"step": 1912
},
{
"epoch": 0.3973001038421599,
"grad_norm": 0.4922661781311035,
"learning_rate": 8e-05,
"loss": 1.5312,
"step": 1913
},
{
"epoch": 0.3975077881619938,
"grad_norm": 0.5031492710113525,
"learning_rate": 8e-05,
"loss": 1.6502,
"step": 1914
},
{
"epoch": 0.39771547248182765,
"grad_norm": 0.4995499551296234,
"learning_rate": 8e-05,
"loss": 1.5721,
"step": 1915
},
{
"epoch": 0.39792315680166146,
"grad_norm": 0.5230512022972107,
"learning_rate": 8e-05,
"loss": 1.6209,
"step": 1916
},
{
"epoch": 0.3981308411214953,
"grad_norm": 0.4955296516418457,
"learning_rate": 8e-05,
"loss": 1.6216,
"step": 1917
},
{
"epoch": 0.3983385254413292,
"grad_norm": 0.49905237555503845,
"learning_rate": 8e-05,
"loss": 1.6325,
"step": 1918
},
{
"epoch": 0.398546209761163,
"grad_norm": 0.49909424781799316,
"learning_rate": 8e-05,
"loss": 1.6182,
"step": 1919
},
{
"epoch": 0.3987538940809969,
"grad_norm": 0.5071685910224915,
"learning_rate": 8e-05,
"loss": 1.6344,
"step": 1920
},
{
"epoch": 0.39896157840083074,
"grad_norm": 0.4874131977558136,
"learning_rate": 8e-05,
"loss": 1.6058,
"step": 1921
},
{
"epoch": 0.3991692627206646,
"grad_norm": 0.4769895374774933,
"learning_rate": 8e-05,
"loss": 1.4976,
"step": 1922
},
{
"epoch": 0.3993769470404984,
"grad_norm": 0.47417256236076355,
"learning_rate": 8e-05,
"loss": 1.4958,
"step": 1923
},
{
"epoch": 0.3995846313603323,
"grad_norm": 0.5021626353263855,
"learning_rate": 8e-05,
"loss": 1.6307,
"step": 1924
},
{
"epoch": 0.39979231568016615,
"grad_norm": 0.48706451058387756,
"learning_rate": 8e-05,
"loss": 1.5546,
"step": 1925
},
{
"epoch": 0.4,
"grad_norm": 0.5021299719810486,
"learning_rate": 8e-05,
"loss": 1.6139,
"step": 1926
},
{
"epoch": 0.40020768431983383,
"grad_norm": 0.4878026247024536,
"learning_rate": 8e-05,
"loss": 1.565,
"step": 1927
},
{
"epoch": 0.4004153686396677,
"grad_norm": 0.4966025650501251,
"learning_rate": 8e-05,
"loss": 1.5577,
"step": 1928
},
{
"epoch": 0.40062305295950157,
"grad_norm": 0.49730899930000305,
"learning_rate": 8e-05,
"loss": 1.5656,
"step": 1929
},
{
"epoch": 0.40083073727933544,
"grad_norm": 0.5028313994407654,
"learning_rate": 8e-05,
"loss": 1.6599,
"step": 1930
},
{
"epoch": 0.40103842159916925,
"grad_norm": 0.5250521898269653,
"learning_rate": 8e-05,
"loss": 1.5481,
"step": 1931
},
{
"epoch": 0.4012461059190031,
"grad_norm": 0.5033625364303589,
"learning_rate": 8e-05,
"loss": 1.5811,
"step": 1932
},
{
"epoch": 0.401453790238837,
"grad_norm": 0.4944176971912384,
"learning_rate": 8e-05,
"loss": 1.619,
"step": 1933
},
{
"epoch": 0.4016614745586708,
"grad_norm": 0.5117709040641785,
"learning_rate": 8e-05,
"loss": 1.5833,
"step": 1934
},
{
"epoch": 0.40186915887850466,
"grad_norm": 0.50387042760849,
"learning_rate": 8e-05,
"loss": 1.5767,
"step": 1935
},
{
"epoch": 0.40207684319833853,
"grad_norm": 0.5241379141807556,
"learning_rate": 8e-05,
"loss": 1.5629,
"step": 1936
},
{
"epoch": 0.4022845275181724,
"grad_norm": 0.5203877687454224,
"learning_rate": 8e-05,
"loss": 1.5862,
"step": 1937
},
{
"epoch": 0.4024922118380062,
"grad_norm": 0.49413588643074036,
"learning_rate": 8e-05,
"loss": 1.5689,
"step": 1938
},
{
"epoch": 0.4026998961578401,
"grad_norm": 0.5159587860107422,
"learning_rate": 8e-05,
"loss": 1.6136,
"step": 1939
},
{
"epoch": 0.40290758047767394,
"grad_norm": 0.5072512626647949,
"learning_rate": 8e-05,
"loss": 1.5555,
"step": 1940
},
{
"epoch": 0.4031152647975078,
"grad_norm": 0.500592052936554,
"learning_rate": 8e-05,
"loss": 1.5846,
"step": 1941
},
{
"epoch": 0.4033229491173416,
"grad_norm": 0.4672267735004425,
"learning_rate": 8e-05,
"loss": 1.5187,
"step": 1942
},
{
"epoch": 0.4035306334371755,
"grad_norm": 0.48535841703414917,
"learning_rate": 8e-05,
"loss": 1.6116,
"step": 1943
},
{
"epoch": 0.40373831775700936,
"grad_norm": 0.48251843452453613,
"learning_rate": 8e-05,
"loss": 1.5747,
"step": 1944
},
{
"epoch": 0.4039460020768432,
"grad_norm": 0.4987678825855255,
"learning_rate": 8e-05,
"loss": 1.5857,
"step": 1945
},
{
"epoch": 0.40415368639667704,
"grad_norm": 0.5083807110786438,
"learning_rate": 8e-05,
"loss": 1.5437,
"step": 1946
},
{
"epoch": 0.4043613707165109,
"grad_norm": 0.5144785642623901,
"learning_rate": 8e-05,
"loss": 1.6153,
"step": 1947
},
{
"epoch": 0.40456905503634477,
"grad_norm": 0.4901933968067169,
"learning_rate": 8e-05,
"loss": 1.5634,
"step": 1948
},
{
"epoch": 0.4047767393561786,
"grad_norm": 0.47113847732543945,
"learning_rate": 8e-05,
"loss": 1.5647,
"step": 1949
},
{
"epoch": 0.40498442367601245,
"grad_norm": 0.4938381314277649,
"learning_rate": 8e-05,
"loss": 1.5668,
"step": 1950
},
{
"epoch": 0.4051921079958463,
"grad_norm": 0.5108892321586609,
"learning_rate": 8e-05,
"loss": 1.6338,
"step": 1951
},
{
"epoch": 0.4053997923156802,
"grad_norm": 0.46494653820991516,
"learning_rate": 8e-05,
"loss": 1.4997,
"step": 1952
},
{
"epoch": 0.405607476635514,
"grad_norm": 0.48271963000297546,
"learning_rate": 8e-05,
"loss": 1.5029,
"step": 1953
},
{
"epoch": 0.40581516095534786,
"grad_norm": 0.5015127658843994,
"learning_rate": 8e-05,
"loss": 1.5271,
"step": 1954
},
{
"epoch": 0.40602284527518173,
"grad_norm": 0.5078144669532776,
"learning_rate": 8e-05,
"loss": 1.5529,
"step": 1955
},
{
"epoch": 0.4062305295950156,
"grad_norm": 0.5320901870727539,
"learning_rate": 8e-05,
"loss": 1.6427,
"step": 1956
},
{
"epoch": 0.4064382139148494,
"grad_norm": 0.501735270023346,
"learning_rate": 8e-05,
"loss": 1.5977,
"step": 1957
},
{
"epoch": 0.4066458982346833,
"grad_norm": 0.4956776201725006,
"learning_rate": 8e-05,
"loss": 1.5942,
"step": 1958
},
{
"epoch": 0.40685358255451715,
"grad_norm": 0.48016679286956787,
"learning_rate": 8e-05,
"loss": 1.5697,
"step": 1959
},
{
"epoch": 0.407061266874351,
"grad_norm": 0.500160813331604,
"learning_rate": 8e-05,
"loss": 1.5724,
"step": 1960
},
{
"epoch": 0.4072689511941848,
"grad_norm": 0.5022308230400085,
"learning_rate": 8e-05,
"loss": 1.5612,
"step": 1961
},
{
"epoch": 0.4074766355140187,
"grad_norm": 0.5192114114761353,
"learning_rate": 8e-05,
"loss": 1.6205,
"step": 1962
},
{
"epoch": 0.40768431983385256,
"grad_norm": 0.470253050327301,
"learning_rate": 8e-05,
"loss": 1.5219,
"step": 1963
},
{
"epoch": 0.40789200415368637,
"grad_norm": 0.48591867089271545,
"learning_rate": 8e-05,
"loss": 1.6203,
"step": 1964
},
{
"epoch": 0.40809968847352024,
"grad_norm": 0.49583899974823,
"learning_rate": 8e-05,
"loss": 1.548,
"step": 1965
},
{
"epoch": 0.4083073727933541,
"grad_norm": 0.5171915292739868,
"learning_rate": 8e-05,
"loss": 1.6081,
"step": 1966
},
{
"epoch": 0.408515057113188,
"grad_norm": 0.49792858958244324,
"learning_rate": 8e-05,
"loss": 1.6061,
"step": 1967
},
{
"epoch": 0.4087227414330218,
"grad_norm": 0.4974616467952728,
"learning_rate": 8e-05,
"loss": 1.6322,
"step": 1968
},
{
"epoch": 0.40893042575285565,
"grad_norm": 0.5105950832366943,
"learning_rate": 8e-05,
"loss": 1.5612,
"step": 1969
},
{
"epoch": 0.4091381100726895,
"grad_norm": 0.49406614899635315,
"learning_rate": 8e-05,
"loss": 1.5887,
"step": 1970
},
{
"epoch": 0.4093457943925234,
"grad_norm": 0.4994363784790039,
"learning_rate": 8e-05,
"loss": 1.6257,
"step": 1971
},
{
"epoch": 0.4095534787123572,
"grad_norm": 0.5137128233909607,
"learning_rate": 8e-05,
"loss": 1.5861,
"step": 1972
},
{
"epoch": 0.40976116303219107,
"grad_norm": 0.47833117842674255,
"learning_rate": 8e-05,
"loss": 1.5661,
"step": 1973
},
{
"epoch": 0.40996884735202493,
"grad_norm": 0.5011354684829712,
"learning_rate": 8e-05,
"loss": 1.6508,
"step": 1974
},
{
"epoch": 0.4101765316718588,
"grad_norm": 0.49250781536102295,
"learning_rate": 8e-05,
"loss": 1.587,
"step": 1975
},
{
"epoch": 0.4103842159916926,
"grad_norm": 0.5081549882888794,
"learning_rate": 8e-05,
"loss": 1.5634,
"step": 1976
},
{
"epoch": 0.4105919003115265,
"grad_norm": 0.503896176815033,
"learning_rate": 8e-05,
"loss": 1.5714,
"step": 1977
},
{
"epoch": 0.41079958463136035,
"grad_norm": 0.5175586938858032,
"learning_rate": 8e-05,
"loss": 1.5938,
"step": 1978
},
{
"epoch": 0.41100726895119416,
"grad_norm": 0.4877651631832123,
"learning_rate": 8e-05,
"loss": 1.5255,
"step": 1979
},
{
"epoch": 0.411214953271028,
"grad_norm": 0.5077330470085144,
"learning_rate": 8e-05,
"loss": 1.5783,
"step": 1980
},
{
"epoch": 0.4114226375908619,
"grad_norm": 0.48941168189048767,
"learning_rate": 8e-05,
"loss": 1.6246,
"step": 1981
},
{
"epoch": 0.41163032191069576,
"grad_norm": 0.4887639582157135,
"learning_rate": 8e-05,
"loss": 1.581,
"step": 1982
},
{
"epoch": 0.4118380062305296,
"grad_norm": 0.48687008023262024,
"learning_rate": 8e-05,
"loss": 1.5368,
"step": 1983
},
{
"epoch": 0.41204569055036344,
"grad_norm": 0.4982350468635559,
"learning_rate": 8e-05,
"loss": 1.6293,
"step": 1984
},
{
"epoch": 0.4122533748701973,
"grad_norm": 0.48353442549705505,
"learning_rate": 8e-05,
"loss": 1.5929,
"step": 1985
},
{
"epoch": 0.4124610591900312,
"grad_norm": 0.47105276584625244,
"learning_rate": 8e-05,
"loss": 1.5741,
"step": 1986
},
{
"epoch": 0.412668743509865,
"grad_norm": 0.49714410305023193,
"learning_rate": 8e-05,
"loss": 1.5607,
"step": 1987
},
{
"epoch": 0.41287642782969886,
"grad_norm": 0.4911838173866272,
"learning_rate": 8e-05,
"loss": 1.5755,
"step": 1988
},
{
"epoch": 0.4130841121495327,
"grad_norm": 0.47256773710250854,
"learning_rate": 8e-05,
"loss": 1.4882,
"step": 1989
},
{
"epoch": 0.4132917964693666,
"grad_norm": 0.5255026817321777,
"learning_rate": 8e-05,
"loss": 1.586,
"step": 1990
},
{
"epoch": 0.4134994807892004,
"grad_norm": 0.48072895407676697,
"learning_rate": 8e-05,
"loss": 1.541,
"step": 1991
},
{
"epoch": 0.41370716510903427,
"grad_norm": 0.5017218589782715,
"learning_rate": 8e-05,
"loss": 1.6442,
"step": 1992
},
{
"epoch": 0.41391484942886814,
"grad_norm": 0.49725833535194397,
"learning_rate": 8e-05,
"loss": 1.6044,
"step": 1993
},
{
"epoch": 0.41412253374870195,
"grad_norm": 0.4846404790878296,
"learning_rate": 8e-05,
"loss": 1.5728,
"step": 1994
},
{
"epoch": 0.4143302180685358,
"grad_norm": 0.5051259398460388,
"learning_rate": 8e-05,
"loss": 1.5835,
"step": 1995
},
{
"epoch": 0.4145379023883697,
"grad_norm": 0.504957377910614,
"learning_rate": 8e-05,
"loss": 1.5677,
"step": 1996
},
{
"epoch": 0.41474558670820355,
"grad_norm": 0.5060474276542664,
"learning_rate": 8e-05,
"loss": 1.5864,
"step": 1997
},
{
"epoch": 0.41495327102803736,
"grad_norm": 0.5011793375015259,
"learning_rate": 8e-05,
"loss": 1.6073,
"step": 1998
},
{
"epoch": 0.41516095534787123,
"grad_norm": 0.5134116411209106,
"learning_rate": 8e-05,
"loss": 1.59,
"step": 1999
},
{
"epoch": 0.4153686396677051,
"grad_norm": 0.5138752460479736,
"learning_rate": 8e-05,
"loss": 1.5954,
"step": 2000
},
{
"epoch": 0.41557632398753896,
"grad_norm": 0.4684045612812042,
"learning_rate": 8e-05,
"loss": 1.4873,
"step": 2001
},
{
"epoch": 0.4157840083073728,
"grad_norm": 0.4988643527030945,
"learning_rate": 8e-05,
"loss": 1.5767,
"step": 2002
},
{
"epoch": 0.41599169262720664,
"grad_norm": 0.4997043311595917,
"learning_rate": 8e-05,
"loss": 1.5523,
"step": 2003
},
{
"epoch": 0.4161993769470405,
"grad_norm": 0.4968480169773102,
"learning_rate": 8e-05,
"loss": 1.5646,
"step": 2004
},
{
"epoch": 0.4164070612668744,
"grad_norm": 0.49053269624710083,
"learning_rate": 8e-05,
"loss": 1.5753,
"step": 2005
},
{
"epoch": 0.4166147455867082,
"grad_norm": 0.511942982673645,
"learning_rate": 8e-05,
"loss": 1.5919,
"step": 2006
},
{
"epoch": 0.41682242990654206,
"grad_norm": 0.4872995913028717,
"learning_rate": 8e-05,
"loss": 1.5418,
"step": 2007
},
{
"epoch": 0.4170301142263759,
"grad_norm": 0.4999676048755646,
"learning_rate": 8e-05,
"loss": 1.5057,
"step": 2008
},
{
"epoch": 0.41723779854620974,
"grad_norm": 0.5027379989624023,
"learning_rate": 8e-05,
"loss": 1.5947,
"step": 2009
},
{
"epoch": 0.4174454828660436,
"grad_norm": 0.5082154870033264,
"learning_rate": 8e-05,
"loss": 1.5025,
"step": 2010
},
{
"epoch": 0.41765316718587747,
"grad_norm": 0.5095928907394409,
"learning_rate": 8e-05,
"loss": 1.6436,
"step": 2011
},
{
"epoch": 0.41786085150571134,
"grad_norm": 0.5015538930892944,
"learning_rate": 8e-05,
"loss": 1.6064,
"step": 2012
},
{
"epoch": 0.41806853582554515,
"grad_norm": 0.5190447568893433,
"learning_rate": 8e-05,
"loss": 1.6239,
"step": 2013
},
{
"epoch": 0.418276220145379,
"grad_norm": 0.5272454023361206,
"learning_rate": 8e-05,
"loss": 1.5599,
"step": 2014
},
{
"epoch": 0.4184839044652129,
"grad_norm": 0.4809977412223816,
"learning_rate": 8e-05,
"loss": 1.5921,
"step": 2015
},
{
"epoch": 0.41869158878504675,
"grad_norm": 0.5302486419677734,
"learning_rate": 8e-05,
"loss": 1.6166,
"step": 2016
},
{
"epoch": 0.41889927310488057,
"grad_norm": 0.5640528202056885,
"learning_rate": 8e-05,
"loss": 1.5801,
"step": 2017
},
{
"epoch": 0.41910695742471443,
"grad_norm": 0.47891366481781006,
"learning_rate": 8e-05,
"loss": 1.5587,
"step": 2018
},
{
"epoch": 0.4193146417445483,
"grad_norm": 0.5397331714630127,
"learning_rate": 8e-05,
"loss": 1.5776,
"step": 2019
},
{
"epoch": 0.4195223260643821,
"grad_norm": 0.5956094861030579,
"learning_rate": 8e-05,
"loss": 1.6211,
"step": 2020
},
{
"epoch": 0.419730010384216,
"grad_norm": 0.5107740163803101,
"learning_rate": 8e-05,
"loss": 1.5543,
"step": 2021
},
{
"epoch": 0.41993769470404985,
"grad_norm": 0.5048316717147827,
"learning_rate": 8e-05,
"loss": 1.5948,
"step": 2022
},
{
"epoch": 0.4201453790238837,
"grad_norm": 0.526158332824707,
"learning_rate": 8e-05,
"loss": 1.5892,
"step": 2023
},
{
"epoch": 0.4203530633437175,
"grad_norm": 0.5123574733734131,
"learning_rate": 8e-05,
"loss": 1.5103,
"step": 2024
},
{
"epoch": 0.4205607476635514,
"grad_norm": 0.5256283283233643,
"learning_rate": 8e-05,
"loss": 1.6267,
"step": 2025
},
{
"epoch": 0.42076843198338526,
"grad_norm": 0.5114898681640625,
"learning_rate": 8e-05,
"loss": 1.6206,
"step": 2026
},
{
"epoch": 0.42097611630321913,
"grad_norm": 0.5819741487503052,
"learning_rate": 8e-05,
"loss": 1.5611,
"step": 2027
},
{
"epoch": 0.42118380062305294,
"grad_norm": 0.5962750315666199,
"learning_rate": 8e-05,
"loss": 1.5,
"step": 2028
},
{
"epoch": 0.4213914849428868,
"grad_norm": 0.5251797437667847,
"learning_rate": 8e-05,
"loss": 1.6389,
"step": 2029
},
{
"epoch": 0.4215991692627207,
"grad_norm": 0.48249077796936035,
"learning_rate": 8e-05,
"loss": 1.5256,
"step": 2030
},
{
"epoch": 0.42180685358255454,
"grad_norm": 0.5865249633789062,
"learning_rate": 8e-05,
"loss": 1.5666,
"step": 2031
},
{
"epoch": 0.42201453790238835,
"grad_norm": 0.5307789444923401,
"learning_rate": 8e-05,
"loss": 1.5807,
"step": 2032
},
{
"epoch": 0.4222222222222222,
"grad_norm": 0.5467637777328491,
"learning_rate": 8e-05,
"loss": 1.6559,
"step": 2033
},
{
"epoch": 0.4224299065420561,
"grad_norm": 0.4931851029396057,
"learning_rate": 8e-05,
"loss": 1.5186,
"step": 2034
},
{
"epoch": 0.4226375908618899,
"grad_norm": 0.5111013650894165,
"learning_rate": 8e-05,
"loss": 1.5595,
"step": 2035
},
{
"epoch": 0.42284527518172377,
"grad_norm": 0.5746096968650818,
"learning_rate": 8e-05,
"loss": 1.5242,
"step": 2036
},
{
"epoch": 0.42305295950155763,
"grad_norm": 0.49678611755371094,
"learning_rate": 8e-05,
"loss": 1.5794,
"step": 2037
},
{
"epoch": 0.4232606438213915,
"grad_norm": 0.5084017515182495,
"learning_rate": 8e-05,
"loss": 1.5751,
"step": 2038
},
{
"epoch": 0.4234683281412253,
"grad_norm": 0.48849454522132874,
"learning_rate": 8e-05,
"loss": 1.521,
"step": 2039
},
{
"epoch": 0.4236760124610592,
"grad_norm": 0.5270300507545471,
"learning_rate": 8e-05,
"loss": 1.6187,
"step": 2040
},
{
"epoch": 0.42388369678089305,
"grad_norm": 0.5154289603233337,
"learning_rate": 8e-05,
"loss": 1.5474,
"step": 2041
},
{
"epoch": 0.4240913811007269,
"grad_norm": 0.4959946870803833,
"learning_rate": 8e-05,
"loss": 1.5549,
"step": 2042
},
{
"epoch": 0.42429906542056073,
"grad_norm": 0.49153828620910645,
"learning_rate": 8e-05,
"loss": 1.5627,
"step": 2043
},
{
"epoch": 0.4245067497403946,
"grad_norm": 0.504295289516449,
"learning_rate": 8e-05,
"loss": 1.5422,
"step": 2044
},
{
"epoch": 0.42471443406022846,
"grad_norm": 0.4918515384197235,
"learning_rate": 8e-05,
"loss": 1.6011,
"step": 2045
},
{
"epoch": 0.42492211838006233,
"grad_norm": 0.5110564827919006,
"learning_rate": 8e-05,
"loss": 1.5277,
"step": 2046
},
{
"epoch": 0.42512980269989614,
"grad_norm": 0.49453458189964294,
"learning_rate": 8e-05,
"loss": 1.5887,
"step": 2047
},
{
"epoch": 0.42533748701973,
"grad_norm": 0.49117112159729004,
"learning_rate": 8e-05,
"loss": 1.5371,
"step": 2048
},
{
"epoch": 0.4255451713395639,
"grad_norm": 0.5056717991828918,
"learning_rate": 8e-05,
"loss": 1.6292,
"step": 2049
},
{
"epoch": 0.4257528556593977,
"grad_norm": 0.5276777148246765,
"learning_rate": 8e-05,
"loss": 1.5727,
"step": 2050
},
{
"epoch": 0.42596053997923156,
"grad_norm": 0.4825052320957184,
"learning_rate": 8e-05,
"loss": 1.5985,
"step": 2051
},
{
"epoch": 0.4261682242990654,
"grad_norm": 0.4818319082260132,
"learning_rate": 8e-05,
"loss": 1.5049,
"step": 2052
},
{
"epoch": 0.4263759086188993,
"grad_norm": 0.4871183931827545,
"learning_rate": 8e-05,
"loss": 1.5877,
"step": 2053
},
{
"epoch": 0.4265835929387331,
"grad_norm": 0.5131775140762329,
"learning_rate": 8e-05,
"loss": 1.6401,
"step": 2054
},
{
"epoch": 0.42679127725856697,
"grad_norm": 0.5020706057548523,
"learning_rate": 8e-05,
"loss": 1.5895,
"step": 2055
},
{
"epoch": 0.42699896157840084,
"grad_norm": 0.5350056886672974,
"learning_rate": 8e-05,
"loss": 1.584,
"step": 2056
},
{
"epoch": 0.4272066458982347,
"grad_norm": 0.5034728646278381,
"learning_rate": 8e-05,
"loss": 1.581,
"step": 2057
},
{
"epoch": 0.4274143302180685,
"grad_norm": 0.5005778670310974,
"learning_rate": 8e-05,
"loss": 1.5836,
"step": 2058
},
{
"epoch": 0.4276220145379024,
"grad_norm": 0.5032508969306946,
"learning_rate": 8e-05,
"loss": 1.6192,
"step": 2059
},
{
"epoch": 0.42782969885773625,
"grad_norm": 0.5014644861221313,
"learning_rate": 8e-05,
"loss": 1.5815,
"step": 2060
},
{
"epoch": 0.4280373831775701,
"grad_norm": 0.48992249369621277,
"learning_rate": 8e-05,
"loss": 1.5504,
"step": 2061
},
{
"epoch": 0.42824506749740393,
"grad_norm": 0.49682915210723877,
"learning_rate": 8e-05,
"loss": 1.5766,
"step": 2062
},
{
"epoch": 0.4284527518172378,
"grad_norm": 0.5146527290344238,
"learning_rate": 8e-05,
"loss": 1.6162,
"step": 2063
},
{
"epoch": 0.42866043613707167,
"grad_norm": 0.4989301562309265,
"learning_rate": 8e-05,
"loss": 1.603,
"step": 2064
},
{
"epoch": 0.4288681204569055,
"grad_norm": 0.5287513136863708,
"learning_rate": 8e-05,
"loss": 1.6206,
"step": 2065
},
{
"epoch": 0.42907580477673934,
"grad_norm": 0.5166513919830322,
"learning_rate": 8e-05,
"loss": 1.5177,
"step": 2066
},
{
"epoch": 0.4292834890965732,
"grad_norm": 0.52769935131073,
"learning_rate": 8e-05,
"loss": 1.5699,
"step": 2067
},
{
"epoch": 0.4294911734164071,
"grad_norm": 0.5170919895172119,
"learning_rate": 8e-05,
"loss": 1.6006,
"step": 2068
},
{
"epoch": 0.4296988577362409,
"grad_norm": 0.5070621371269226,
"learning_rate": 8e-05,
"loss": 1.6021,
"step": 2069
},
{
"epoch": 0.42990654205607476,
"grad_norm": 0.5316301584243774,
"learning_rate": 8e-05,
"loss": 1.5772,
"step": 2070
},
{
"epoch": 0.4301142263759086,
"grad_norm": 0.4892559349536896,
"learning_rate": 8e-05,
"loss": 1.5981,
"step": 2071
},
{
"epoch": 0.4303219106957425,
"grad_norm": 0.48384004831314087,
"learning_rate": 8e-05,
"loss": 1.5119,
"step": 2072
},
{
"epoch": 0.4305295950155763,
"grad_norm": 0.486844539642334,
"learning_rate": 8e-05,
"loss": 1.5741,
"step": 2073
},
{
"epoch": 0.4307372793354102,
"grad_norm": 0.5377826690673828,
"learning_rate": 8e-05,
"loss": 1.617,
"step": 2074
},
{
"epoch": 0.43094496365524404,
"grad_norm": 0.5149186253547668,
"learning_rate": 8e-05,
"loss": 1.553,
"step": 2075
},
{
"epoch": 0.4311526479750779,
"grad_norm": 0.4895002841949463,
"learning_rate": 8e-05,
"loss": 1.5779,
"step": 2076
},
{
"epoch": 0.4313603322949117,
"grad_norm": 0.48911821842193604,
"learning_rate": 8e-05,
"loss": 1.5364,
"step": 2077
},
{
"epoch": 0.4315680166147456,
"grad_norm": 0.49317991733551025,
"learning_rate": 8e-05,
"loss": 1.5906,
"step": 2078
},
{
"epoch": 0.43177570093457945,
"grad_norm": 0.5003262162208557,
"learning_rate": 8e-05,
"loss": 1.5706,
"step": 2079
},
{
"epoch": 0.43198338525441327,
"grad_norm": 0.5054447650909424,
"learning_rate": 8e-05,
"loss": 1.5691,
"step": 2080
},
{
"epoch": 0.43219106957424713,
"grad_norm": 0.4890398383140564,
"learning_rate": 8e-05,
"loss": 1.5654,
"step": 2081
},
{
"epoch": 0.432398753894081,
"grad_norm": 0.5025361180305481,
"learning_rate": 8e-05,
"loss": 1.5346,
"step": 2082
},
{
"epoch": 0.43260643821391487,
"grad_norm": 0.5030412673950195,
"learning_rate": 8e-05,
"loss": 1.5995,
"step": 2083
},
{
"epoch": 0.4328141225337487,
"grad_norm": 0.5007551908493042,
"learning_rate": 8e-05,
"loss": 1.6198,
"step": 2084
},
{
"epoch": 0.43302180685358255,
"grad_norm": 0.5120386481285095,
"learning_rate": 8e-05,
"loss": 1.5953,
"step": 2085
},
{
"epoch": 0.4332294911734164,
"grad_norm": 0.5004842281341553,
"learning_rate": 8e-05,
"loss": 1.5432,
"step": 2086
},
{
"epoch": 0.4334371754932503,
"grad_norm": 0.521942675113678,
"learning_rate": 8e-05,
"loss": 1.6336,
"step": 2087
},
{
"epoch": 0.4336448598130841,
"grad_norm": 0.494864284992218,
"learning_rate": 8e-05,
"loss": 1.5748,
"step": 2088
},
{
"epoch": 0.43385254413291796,
"grad_norm": 0.49404293298721313,
"learning_rate": 8e-05,
"loss": 1.6049,
"step": 2089
},
{
"epoch": 0.43406022845275183,
"grad_norm": 0.5624111294746399,
"learning_rate": 8e-05,
"loss": 1.5513,
"step": 2090
},
{
"epoch": 0.4342679127725857,
"grad_norm": 0.5119521021842957,
"learning_rate": 8e-05,
"loss": 1.5539,
"step": 2091
},
{
"epoch": 0.4344755970924195,
"grad_norm": 0.51494961977005,
"learning_rate": 8e-05,
"loss": 1.586,
"step": 2092
},
{
"epoch": 0.4346832814122534,
"grad_norm": 0.5104636549949646,
"learning_rate": 8e-05,
"loss": 1.6364,
"step": 2093
},
{
"epoch": 0.43489096573208724,
"grad_norm": 0.5149340629577637,
"learning_rate": 8e-05,
"loss": 1.5458,
"step": 2094
},
{
"epoch": 0.43509865005192105,
"grad_norm": 0.5006559491157532,
"learning_rate": 8e-05,
"loss": 1.6001,
"step": 2095
},
{
"epoch": 0.4353063343717549,
"grad_norm": 0.4876810908317566,
"learning_rate": 8e-05,
"loss": 1.5671,
"step": 2096
},
{
"epoch": 0.4355140186915888,
"grad_norm": 0.5194754600524902,
"learning_rate": 8e-05,
"loss": 1.5521,
"step": 2097
},
{
"epoch": 0.43572170301142266,
"grad_norm": 0.4904755651950836,
"learning_rate": 8e-05,
"loss": 1.5825,
"step": 2098
},
{
"epoch": 0.43592938733125647,
"grad_norm": 0.49247294664382935,
"learning_rate": 8e-05,
"loss": 1.4871,
"step": 2099
},
{
"epoch": 0.43613707165109034,
"grad_norm": 0.5013484358787537,
"learning_rate": 8e-05,
"loss": 1.5281,
"step": 2100
},
{
"epoch": 0.4363447559709242,
"grad_norm": 0.5134440064430237,
"learning_rate": 8e-05,
"loss": 1.596,
"step": 2101
},
{
"epoch": 0.43655244029075807,
"grad_norm": 0.5220896601676941,
"learning_rate": 8e-05,
"loss": 1.6232,
"step": 2102
},
{
"epoch": 0.4367601246105919,
"grad_norm": 0.49945664405822754,
"learning_rate": 8e-05,
"loss": 1.5805,
"step": 2103
},
{
"epoch": 0.43696780893042575,
"grad_norm": 0.5018790364265442,
"learning_rate": 8e-05,
"loss": 1.5935,
"step": 2104
},
{
"epoch": 0.4371754932502596,
"grad_norm": 0.5007161498069763,
"learning_rate": 8e-05,
"loss": 1.6203,
"step": 2105
},
{
"epoch": 0.4373831775700935,
"grad_norm": 0.5214696526527405,
"learning_rate": 8e-05,
"loss": 1.6249,
"step": 2106
},
{
"epoch": 0.4375908618899273,
"grad_norm": 0.5219676494598389,
"learning_rate": 8e-05,
"loss": 1.5492,
"step": 2107
},
{
"epoch": 0.43779854620976116,
"grad_norm": 0.5156602263450623,
"learning_rate": 8e-05,
"loss": 1.6083,
"step": 2108
},
{
"epoch": 0.43800623052959503,
"grad_norm": 0.534309446811676,
"learning_rate": 8e-05,
"loss": 1.7035,
"step": 2109
},
{
"epoch": 0.43821391484942884,
"grad_norm": 0.4978565275669098,
"learning_rate": 8e-05,
"loss": 1.5881,
"step": 2110
},
{
"epoch": 0.4384215991692627,
"grad_norm": 0.5232987403869629,
"learning_rate": 8e-05,
"loss": 1.6172,
"step": 2111
},
{
"epoch": 0.4386292834890966,
"grad_norm": 0.512504518032074,
"learning_rate": 8e-05,
"loss": 1.5703,
"step": 2112
},
{
"epoch": 0.43883696780893044,
"grad_norm": 0.47827327251434326,
"learning_rate": 8e-05,
"loss": 1.5624,
"step": 2113
},
{
"epoch": 0.43904465212876426,
"grad_norm": 0.49305495619773865,
"learning_rate": 8e-05,
"loss": 1.537,
"step": 2114
},
{
"epoch": 0.4392523364485981,
"grad_norm": 0.4876486361026764,
"learning_rate": 8e-05,
"loss": 1.5634,
"step": 2115
},
{
"epoch": 0.439460020768432,
"grad_norm": 0.5139252543449402,
"learning_rate": 8e-05,
"loss": 1.5554,
"step": 2116
},
{
"epoch": 0.43966770508826586,
"grad_norm": 0.5262313485145569,
"learning_rate": 8e-05,
"loss": 1.5863,
"step": 2117
},
{
"epoch": 0.43987538940809967,
"grad_norm": 0.527298092842102,
"learning_rate": 8e-05,
"loss": 1.5333,
"step": 2118
},
{
"epoch": 0.44008307372793354,
"grad_norm": 0.5263831615447998,
"learning_rate": 8e-05,
"loss": 1.6122,
"step": 2119
},
{
"epoch": 0.4402907580477674,
"grad_norm": 0.49422043561935425,
"learning_rate": 8e-05,
"loss": 1.5043,
"step": 2120
},
{
"epoch": 0.4404984423676013,
"grad_norm": 0.4889119565486908,
"learning_rate": 8e-05,
"loss": 1.5843,
"step": 2121
},
{
"epoch": 0.4407061266874351,
"grad_norm": 0.5255298614501953,
"learning_rate": 8e-05,
"loss": 1.6276,
"step": 2122
},
{
"epoch": 0.44091381100726895,
"grad_norm": 0.4965208172798157,
"learning_rate": 8e-05,
"loss": 1.6024,
"step": 2123
},
{
"epoch": 0.4411214953271028,
"grad_norm": 0.5215441584587097,
"learning_rate": 8e-05,
"loss": 1.6019,
"step": 2124
},
{
"epoch": 0.44132917964693663,
"grad_norm": 0.50166255235672,
"learning_rate": 8e-05,
"loss": 1.5099,
"step": 2125
},
{
"epoch": 0.4415368639667705,
"grad_norm": 0.48959848284721375,
"learning_rate": 8e-05,
"loss": 1.5324,
"step": 2126
},
{
"epoch": 0.44174454828660437,
"grad_norm": 0.496033638715744,
"learning_rate": 8e-05,
"loss": 1.6065,
"step": 2127
},
{
"epoch": 0.44195223260643823,
"grad_norm": 0.5017862319946289,
"learning_rate": 8e-05,
"loss": 1.5673,
"step": 2128
},
{
"epoch": 0.44215991692627205,
"grad_norm": 0.49096164107322693,
"learning_rate": 8e-05,
"loss": 1.6137,
"step": 2129
},
{
"epoch": 0.4423676012461059,
"grad_norm": 0.51192307472229,
"learning_rate": 8e-05,
"loss": 1.5664,
"step": 2130
},
{
"epoch": 0.4425752855659398,
"grad_norm": 0.5064907670021057,
"learning_rate": 8e-05,
"loss": 1.6066,
"step": 2131
},
{
"epoch": 0.44278296988577365,
"grad_norm": 0.4990641176700592,
"learning_rate": 8e-05,
"loss": 1.6383,
"step": 2132
},
{
"epoch": 0.44299065420560746,
"grad_norm": 0.5085825324058533,
"learning_rate": 8e-05,
"loss": 1.5627,
"step": 2133
},
{
"epoch": 0.4431983385254413,
"grad_norm": 0.5027173757553101,
"learning_rate": 8e-05,
"loss": 1.5873,
"step": 2134
},
{
"epoch": 0.4434060228452752,
"grad_norm": 0.48605379462242126,
"learning_rate": 8e-05,
"loss": 1.6316,
"step": 2135
},
{
"epoch": 0.44361370716510906,
"grad_norm": 0.5001710057258606,
"learning_rate": 8e-05,
"loss": 1.6056,
"step": 2136
},
{
"epoch": 0.4438213914849429,
"grad_norm": 0.5118576884269714,
"learning_rate": 8e-05,
"loss": 1.5986,
"step": 2137
},
{
"epoch": 0.44402907580477674,
"grad_norm": 0.5374376773834229,
"learning_rate": 8e-05,
"loss": 1.6955,
"step": 2138
},
{
"epoch": 0.4442367601246106,
"grad_norm": 0.5199704170227051,
"learning_rate": 8e-05,
"loss": 1.5547,
"step": 2139
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.5208665132522583,
"learning_rate": 8e-05,
"loss": 1.491,
"step": 2140
},
{
"epoch": 0.4446521287642783,
"grad_norm": 0.4891216456890106,
"learning_rate": 8e-05,
"loss": 1.5564,
"step": 2141
},
{
"epoch": 0.44485981308411215,
"grad_norm": 0.5020160675048828,
"learning_rate": 8e-05,
"loss": 1.6321,
"step": 2142
},
{
"epoch": 0.445067497403946,
"grad_norm": 0.49476900696754456,
"learning_rate": 8e-05,
"loss": 1.5914,
"step": 2143
},
{
"epoch": 0.44527518172377983,
"grad_norm": 0.49868083000183105,
"learning_rate": 8e-05,
"loss": 1.501,
"step": 2144
},
{
"epoch": 0.4454828660436137,
"grad_norm": 0.5365971922874451,
"learning_rate": 8e-05,
"loss": 1.6427,
"step": 2145
},
{
"epoch": 0.44569055036344757,
"grad_norm": 0.5179263353347778,
"learning_rate": 8e-05,
"loss": 1.5516,
"step": 2146
},
{
"epoch": 0.44589823468328144,
"grad_norm": 0.4961378872394562,
"learning_rate": 8e-05,
"loss": 1.487,
"step": 2147
},
{
"epoch": 0.44610591900311525,
"grad_norm": 0.4983571767807007,
"learning_rate": 8e-05,
"loss": 1.6078,
"step": 2148
},
{
"epoch": 0.4463136033229491,
"grad_norm": 0.5751625299453735,
"learning_rate": 8e-05,
"loss": 1.5492,
"step": 2149
},
{
"epoch": 0.446521287642783,
"grad_norm": 0.5159772634506226,
"learning_rate": 8e-05,
"loss": 1.6149,
"step": 2150
},
{
"epoch": 0.44672897196261685,
"grad_norm": 0.48897507786750793,
"learning_rate": 8e-05,
"loss": 1.621,
"step": 2151
},
{
"epoch": 0.44693665628245066,
"grad_norm": 0.544232964515686,
"learning_rate": 8e-05,
"loss": 1.5878,
"step": 2152
},
{
"epoch": 0.44714434060228453,
"grad_norm": 0.5140170454978943,
"learning_rate": 8e-05,
"loss": 1.6066,
"step": 2153
},
{
"epoch": 0.4473520249221184,
"grad_norm": 0.49031364917755127,
"learning_rate": 8e-05,
"loss": 1.5178,
"step": 2154
},
{
"epoch": 0.4475597092419522,
"grad_norm": 0.4932582378387451,
"learning_rate": 8e-05,
"loss": 1.5018,
"step": 2155
},
{
"epoch": 0.4477673935617861,
"grad_norm": 0.5137055516242981,
"learning_rate": 8e-05,
"loss": 1.6213,
"step": 2156
},
{
"epoch": 0.44797507788161994,
"grad_norm": 0.5106627941131592,
"learning_rate": 8e-05,
"loss": 1.5928,
"step": 2157
},
{
"epoch": 0.4481827622014538,
"grad_norm": 0.5080628395080566,
"learning_rate": 8e-05,
"loss": 1.5682,
"step": 2158
},
{
"epoch": 0.4483904465212876,
"grad_norm": 0.5078219771385193,
"learning_rate": 8e-05,
"loss": 1.6174,
"step": 2159
},
{
"epoch": 0.4485981308411215,
"grad_norm": 0.5262932777404785,
"learning_rate": 8e-05,
"loss": 1.536,
"step": 2160
},
{
"epoch": 0.44880581516095536,
"grad_norm": 0.5089083313941956,
"learning_rate": 8e-05,
"loss": 1.543,
"step": 2161
},
{
"epoch": 0.4490134994807892,
"grad_norm": 0.5383339524269104,
"learning_rate": 8e-05,
"loss": 1.6122,
"step": 2162
},
{
"epoch": 0.44922118380062304,
"grad_norm": 0.5017908215522766,
"learning_rate": 8e-05,
"loss": 1.5862,
"step": 2163
},
{
"epoch": 0.4494288681204569,
"grad_norm": 0.4958338439464569,
"learning_rate": 8e-05,
"loss": 1.5661,
"step": 2164
},
{
"epoch": 0.44963655244029077,
"grad_norm": 0.4877220094203949,
"learning_rate": 8e-05,
"loss": 1.5747,
"step": 2165
},
{
"epoch": 0.44984423676012464,
"grad_norm": 0.49807313084602356,
"learning_rate": 8e-05,
"loss": 1.5605,
"step": 2166
},
{
"epoch": 0.45005192107995845,
"grad_norm": 0.5027108788490295,
"learning_rate": 8e-05,
"loss": 1.523,
"step": 2167
},
{
"epoch": 0.4502596053997923,
"grad_norm": 0.492351233959198,
"learning_rate": 8e-05,
"loss": 1.506,
"step": 2168
},
{
"epoch": 0.4504672897196262,
"grad_norm": 0.4846895933151245,
"learning_rate": 8e-05,
"loss": 1.5205,
"step": 2169
},
{
"epoch": 0.45067497403946,
"grad_norm": 0.49987098574638367,
"learning_rate": 8e-05,
"loss": 1.5675,
"step": 2170
},
{
"epoch": 0.45088265835929386,
"grad_norm": 0.4977555274963379,
"learning_rate": 8e-05,
"loss": 1.6083,
"step": 2171
},
{
"epoch": 0.45109034267912773,
"grad_norm": 0.5156652927398682,
"learning_rate": 8e-05,
"loss": 1.6064,
"step": 2172
},
{
"epoch": 0.4512980269989616,
"grad_norm": 0.48706451058387756,
"learning_rate": 8e-05,
"loss": 1.563,
"step": 2173
},
{
"epoch": 0.4515057113187954,
"grad_norm": 0.5008767247200012,
"learning_rate": 8e-05,
"loss": 1.5815,
"step": 2174
},
{
"epoch": 0.4517133956386293,
"grad_norm": 0.4886937737464905,
"learning_rate": 8e-05,
"loss": 1.5869,
"step": 2175
},
{
"epoch": 0.45192107995846315,
"grad_norm": 0.5106097459793091,
"learning_rate": 8e-05,
"loss": 1.6086,
"step": 2176
},
{
"epoch": 0.452128764278297,
"grad_norm": 0.5486605763435364,
"learning_rate": 8e-05,
"loss": 1.5641,
"step": 2177
},
{
"epoch": 0.4523364485981308,
"grad_norm": 0.4942476749420166,
"learning_rate": 8e-05,
"loss": 1.5703,
"step": 2178
},
{
"epoch": 0.4525441329179647,
"grad_norm": 0.5139703750610352,
"learning_rate": 8e-05,
"loss": 1.5634,
"step": 2179
},
{
"epoch": 0.45275181723779856,
"grad_norm": 0.5057925581932068,
"learning_rate": 8e-05,
"loss": 1.5819,
"step": 2180
},
{
"epoch": 0.45295950155763237,
"grad_norm": 0.49567729234695435,
"learning_rate": 8e-05,
"loss": 1.5186,
"step": 2181
},
{
"epoch": 0.45316718587746624,
"grad_norm": 0.549098551273346,
"learning_rate": 8e-05,
"loss": 1.5975,
"step": 2182
},
{
"epoch": 0.4533748701973001,
"grad_norm": 0.51541668176651,
"learning_rate": 8e-05,
"loss": 1.669,
"step": 2183
},
{
"epoch": 0.453582554517134,
"grad_norm": 0.6035095453262329,
"learning_rate": 8e-05,
"loss": 1.6484,
"step": 2184
},
{
"epoch": 0.4537902388369678,
"grad_norm": 0.5210909843444824,
"learning_rate": 8e-05,
"loss": 1.6159,
"step": 2185
},
{
"epoch": 0.45399792315680165,
"grad_norm": 0.5404158234596252,
"learning_rate": 8e-05,
"loss": 1.4956,
"step": 2186
},
{
"epoch": 0.4542056074766355,
"grad_norm": 0.541645884513855,
"learning_rate": 8e-05,
"loss": 1.5535,
"step": 2187
},
{
"epoch": 0.4544132917964694,
"grad_norm": 0.4959733784198761,
"learning_rate": 8e-05,
"loss": 1.5454,
"step": 2188
},
{
"epoch": 0.4546209761163032,
"grad_norm": 0.5585200786590576,
"learning_rate": 8e-05,
"loss": 1.6141,
"step": 2189
},
{
"epoch": 0.45482866043613707,
"grad_norm": 0.5159070491790771,
"learning_rate": 8e-05,
"loss": 1.5757,
"step": 2190
},
{
"epoch": 0.45503634475597093,
"grad_norm": 0.5728627443313599,
"learning_rate": 8e-05,
"loss": 1.6568,
"step": 2191
},
{
"epoch": 0.4552440290758048,
"grad_norm": 0.517425000667572,
"learning_rate": 8e-05,
"loss": 1.483,
"step": 2192
},
{
"epoch": 0.4554517133956386,
"grad_norm": 0.503883421421051,
"learning_rate": 8e-05,
"loss": 1.5684,
"step": 2193
},
{
"epoch": 0.4556593977154725,
"grad_norm": 0.53370201587677,
"learning_rate": 8e-05,
"loss": 1.5402,
"step": 2194
},
{
"epoch": 0.45586708203530635,
"grad_norm": 0.5302833318710327,
"learning_rate": 8e-05,
"loss": 1.5462,
"step": 2195
},
{
"epoch": 0.45607476635514016,
"grad_norm": 0.531651496887207,
"learning_rate": 8e-05,
"loss": 1.5692,
"step": 2196
},
{
"epoch": 0.456282450674974,
"grad_norm": 0.5229583978652954,
"learning_rate": 8e-05,
"loss": 1.5554,
"step": 2197
},
{
"epoch": 0.4564901349948079,
"grad_norm": 0.4997648000717163,
"learning_rate": 8e-05,
"loss": 1.6515,
"step": 2198
},
{
"epoch": 0.45669781931464176,
"grad_norm": 0.5124852657318115,
"learning_rate": 8e-05,
"loss": 1.5617,
"step": 2199
},
{
"epoch": 0.4569055036344756,
"grad_norm": 0.5129377841949463,
"learning_rate": 8e-05,
"loss": 1.6057,
"step": 2200
},
{
"epoch": 0.45711318795430944,
"grad_norm": 0.5239273905754089,
"learning_rate": 8e-05,
"loss": 1.5661,
"step": 2201
},
{
"epoch": 0.4573208722741433,
"grad_norm": 0.54039466381073,
"learning_rate": 8e-05,
"loss": 1.526,
"step": 2202
},
{
"epoch": 0.4575285565939772,
"grad_norm": 0.5111913681030273,
"learning_rate": 8e-05,
"loss": 1.6207,
"step": 2203
},
{
"epoch": 0.457736240913811,
"grad_norm": 0.5141212940216064,
"learning_rate": 8e-05,
"loss": 1.5606,
"step": 2204
},
{
"epoch": 0.45794392523364486,
"grad_norm": 0.510006844997406,
"learning_rate": 8e-05,
"loss": 1.559,
"step": 2205
},
{
"epoch": 0.4581516095534787,
"grad_norm": 0.5465336441993713,
"learning_rate": 8e-05,
"loss": 1.5428,
"step": 2206
},
{
"epoch": 0.4583592938733126,
"grad_norm": 0.5228673815727234,
"learning_rate": 8e-05,
"loss": 1.6287,
"step": 2207
},
{
"epoch": 0.4585669781931464,
"grad_norm": 0.5198245644569397,
"learning_rate": 8e-05,
"loss": 1.5437,
"step": 2208
},
{
"epoch": 0.45877466251298027,
"grad_norm": 0.5053104758262634,
"learning_rate": 8e-05,
"loss": 1.6151,
"step": 2209
},
{
"epoch": 0.45898234683281414,
"grad_norm": 0.5879641771316528,
"learning_rate": 8e-05,
"loss": 1.5599,
"step": 2210
},
{
"epoch": 0.45919003115264795,
"grad_norm": 0.4976397752761841,
"learning_rate": 8e-05,
"loss": 1.5307,
"step": 2211
},
{
"epoch": 0.4593977154724818,
"grad_norm": 0.500487208366394,
"learning_rate": 8e-05,
"loss": 1.4737,
"step": 2212
},
{
"epoch": 0.4596053997923157,
"grad_norm": 0.5141959190368652,
"learning_rate": 8e-05,
"loss": 1.5645,
"step": 2213
},
{
"epoch": 0.45981308411214955,
"grad_norm": 0.5344618558883667,
"learning_rate": 8e-05,
"loss": 1.5177,
"step": 2214
},
{
"epoch": 0.46002076843198336,
"grad_norm": 0.5062136650085449,
"learning_rate": 8e-05,
"loss": 1.5476,
"step": 2215
},
{
"epoch": 0.46022845275181723,
"grad_norm": 0.5057580471038818,
"learning_rate": 8e-05,
"loss": 1.5666,
"step": 2216
},
{
"epoch": 0.4604361370716511,
"grad_norm": 0.5169433355331421,
"learning_rate": 8e-05,
"loss": 1.5551,
"step": 2217
},
{
"epoch": 0.46064382139148496,
"grad_norm": 0.5240709781646729,
"learning_rate": 8e-05,
"loss": 1.6269,
"step": 2218
},
{
"epoch": 0.4608515057113188,
"grad_norm": 0.520158052444458,
"learning_rate": 8e-05,
"loss": 1.612,
"step": 2219
},
{
"epoch": 0.46105919003115264,
"grad_norm": 0.5017967820167542,
"learning_rate": 8e-05,
"loss": 1.6067,
"step": 2220
},
{
"epoch": 0.4612668743509865,
"grad_norm": 0.4989418685436249,
"learning_rate": 8e-05,
"loss": 1.5608,
"step": 2221
},
{
"epoch": 0.4614745586708204,
"grad_norm": 0.5172755122184753,
"learning_rate": 8e-05,
"loss": 1.5421,
"step": 2222
},
{
"epoch": 0.4616822429906542,
"grad_norm": 0.5159169435501099,
"learning_rate": 8e-05,
"loss": 1.6023,
"step": 2223
},
{
"epoch": 0.46188992731048806,
"grad_norm": 0.5040188431739807,
"learning_rate": 8e-05,
"loss": 1.5828,
"step": 2224
},
{
"epoch": 0.4620976116303219,
"grad_norm": 0.5402005314826965,
"learning_rate": 8e-05,
"loss": 1.6258,
"step": 2225
},
{
"epoch": 0.46230529595015574,
"grad_norm": 0.5770882964134216,
"learning_rate": 8e-05,
"loss": 1.5726,
"step": 2226
},
{
"epoch": 0.4625129802699896,
"grad_norm": 0.5088592767715454,
"learning_rate": 8e-05,
"loss": 1.6046,
"step": 2227
},
{
"epoch": 0.46272066458982347,
"grad_norm": 0.5058512687683105,
"learning_rate": 8e-05,
"loss": 1.6188,
"step": 2228
},
{
"epoch": 0.46292834890965734,
"grad_norm": 0.5000367760658264,
"learning_rate": 8e-05,
"loss": 1.5724,
"step": 2229
},
{
"epoch": 0.46313603322949115,
"grad_norm": 0.5102644562721252,
"learning_rate": 8e-05,
"loss": 1.6051,
"step": 2230
},
{
"epoch": 0.463343717549325,
"grad_norm": 0.512116014957428,
"learning_rate": 8e-05,
"loss": 1.5647,
"step": 2231
},
{
"epoch": 0.4635514018691589,
"grad_norm": 0.49340537190437317,
"learning_rate": 8e-05,
"loss": 1.6017,
"step": 2232
},
{
"epoch": 0.46375908618899275,
"grad_norm": 0.4973028004169464,
"learning_rate": 8e-05,
"loss": 1.5594,
"step": 2233
},
{
"epoch": 0.46396677050882656,
"grad_norm": 0.5239974856376648,
"learning_rate": 8e-05,
"loss": 1.5847,
"step": 2234
},
{
"epoch": 0.46417445482866043,
"grad_norm": 0.4996567666530609,
"learning_rate": 8e-05,
"loss": 1.5939,
"step": 2235
},
{
"epoch": 0.4643821391484943,
"grad_norm": 0.5092349052429199,
"learning_rate": 8e-05,
"loss": 1.5328,
"step": 2236
},
{
"epoch": 0.46458982346832817,
"grad_norm": 0.5141004323959351,
"learning_rate": 8e-05,
"loss": 1.6054,
"step": 2237
},
{
"epoch": 0.464797507788162,
"grad_norm": 0.4950210154056549,
"learning_rate": 8e-05,
"loss": 1.5328,
"step": 2238
},
{
"epoch": 0.46500519210799585,
"grad_norm": 0.499602347612381,
"learning_rate": 8e-05,
"loss": 1.555,
"step": 2239
},
{
"epoch": 0.4652128764278297,
"grad_norm": 0.5248525142669678,
"learning_rate": 8e-05,
"loss": 1.6277,
"step": 2240
},
{
"epoch": 0.4654205607476635,
"grad_norm": 0.5104266405105591,
"learning_rate": 8e-05,
"loss": 1.5733,
"step": 2241
},
{
"epoch": 0.4656282450674974,
"grad_norm": 0.4947991669178009,
"learning_rate": 8e-05,
"loss": 1.5984,
"step": 2242
},
{
"epoch": 0.46583592938733126,
"grad_norm": 0.519220232963562,
"learning_rate": 8e-05,
"loss": 1.6088,
"step": 2243
},
{
"epoch": 0.4660436137071651,
"grad_norm": 0.4922308921813965,
"learning_rate": 8e-05,
"loss": 1.5143,
"step": 2244
},
{
"epoch": 0.46625129802699894,
"grad_norm": 0.5001751184463501,
"learning_rate": 8e-05,
"loss": 1.5317,
"step": 2245
},
{
"epoch": 0.4664589823468328,
"grad_norm": 0.5132100582122803,
"learning_rate": 8e-05,
"loss": 1.5765,
"step": 2246
},
{
"epoch": 0.4666666666666667,
"grad_norm": 0.4972493350505829,
"learning_rate": 8e-05,
"loss": 1.5546,
"step": 2247
},
{
"epoch": 0.46687435098650054,
"grad_norm": 0.5021941661834717,
"learning_rate": 8e-05,
"loss": 1.6312,
"step": 2248
},
{
"epoch": 0.46708203530633435,
"grad_norm": 0.48572269082069397,
"learning_rate": 8e-05,
"loss": 1.5355,
"step": 2249
},
{
"epoch": 0.4672897196261682,
"grad_norm": 0.5086265206336975,
"learning_rate": 8e-05,
"loss": 1.5432,
"step": 2250
},
{
"epoch": 0.4674974039460021,
"grad_norm": 0.5098671913146973,
"learning_rate": 8e-05,
"loss": 1.5045,
"step": 2251
},
{
"epoch": 0.46770508826583596,
"grad_norm": 0.5196540951728821,
"learning_rate": 8e-05,
"loss": 1.5743,
"step": 2252
},
{
"epoch": 0.46791277258566977,
"grad_norm": 0.5160411596298218,
"learning_rate": 8e-05,
"loss": 1.6001,
"step": 2253
},
{
"epoch": 0.46812045690550363,
"grad_norm": 0.5127061605453491,
"learning_rate": 8e-05,
"loss": 1.534,
"step": 2254
},
{
"epoch": 0.4683281412253375,
"grad_norm": 0.5143935084342957,
"learning_rate": 8e-05,
"loss": 1.5723,
"step": 2255
},
{
"epoch": 0.4685358255451713,
"grad_norm": 0.5065985918045044,
"learning_rate": 8e-05,
"loss": 1.5099,
"step": 2256
},
{
"epoch": 0.4687435098650052,
"grad_norm": 0.5203616619110107,
"learning_rate": 8e-05,
"loss": 1.5427,
"step": 2257
},
{
"epoch": 0.46895119418483905,
"grad_norm": 0.5806429386138916,
"learning_rate": 8e-05,
"loss": 1.6105,
"step": 2258
},
{
"epoch": 0.4691588785046729,
"grad_norm": 0.4942633807659149,
"learning_rate": 8e-05,
"loss": 1.5873,
"step": 2259
},
{
"epoch": 0.46936656282450673,
"grad_norm": 0.5223998427391052,
"learning_rate": 8e-05,
"loss": 1.6349,
"step": 2260
},
{
"epoch": 0.4695742471443406,
"grad_norm": 0.5162349343299866,
"learning_rate": 8e-05,
"loss": 1.5431,
"step": 2261
},
{
"epoch": 0.46978193146417446,
"grad_norm": 0.4979266822338104,
"learning_rate": 8e-05,
"loss": 1.5814,
"step": 2262
},
{
"epoch": 0.46998961578400833,
"grad_norm": 0.5179408192634583,
"learning_rate": 8e-05,
"loss": 1.6749,
"step": 2263
},
{
"epoch": 0.47019730010384214,
"grad_norm": 0.5000909566879272,
"learning_rate": 8e-05,
"loss": 1.5196,
"step": 2264
},
{
"epoch": 0.470404984423676,
"grad_norm": 0.4916440546512604,
"learning_rate": 8e-05,
"loss": 1.5558,
"step": 2265
},
{
"epoch": 0.4706126687435099,
"grad_norm": 0.5041160583496094,
"learning_rate": 8e-05,
"loss": 1.6098,
"step": 2266
},
{
"epoch": 0.47082035306334374,
"grad_norm": 0.4847199320793152,
"learning_rate": 8e-05,
"loss": 1.5478,
"step": 2267
},
{
"epoch": 0.47102803738317756,
"grad_norm": 0.5013338327407837,
"learning_rate": 8e-05,
"loss": 1.5531,
"step": 2268
},
{
"epoch": 0.4712357217030114,
"grad_norm": 0.49625855684280396,
"learning_rate": 8e-05,
"loss": 1.5985,
"step": 2269
},
{
"epoch": 0.4714434060228453,
"grad_norm": 0.5018194317817688,
"learning_rate": 8e-05,
"loss": 1.5717,
"step": 2270
},
{
"epoch": 0.4716510903426791,
"grad_norm": 0.5168567299842834,
"learning_rate": 8e-05,
"loss": 1.5784,
"step": 2271
},
{
"epoch": 0.47185877466251297,
"grad_norm": 0.49007734656333923,
"learning_rate": 8e-05,
"loss": 1.5197,
"step": 2272
},
{
"epoch": 0.47206645898234684,
"grad_norm": 0.4975670576095581,
"learning_rate": 8e-05,
"loss": 1.6233,
"step": 2273
},
{
"epoch": 0.4722741433021807,
"grad_norm": 0.5049093961715698,
"learning_rate": 8e-05,
"loss": 1.5514,
"step": 2274
},
{
"epoch": 0.4724818276220145,
"grad_norm": 0.5157124996185303,
"learning_rate": 8e-05,
"loss": 1.6597,
"step": 2275
},
{
"epoch": 0.4726895119418484,
"grad_norm": 0.5225481986999512,
"learning_rate": 8e-05,
"loss": 1.6546,
"step": 2276
},
{
"epoch": 0.47289719626168225,
"grad_norm": 0.507124125957489,
"learning_rate": 8e-05,
"loss": 1.6082,
"step": 2277
},
{
"epoch": 0.4731048805815161,
"grad_norm": 0.495510071516037,
"learning_rate": 8e-05,
"loss": 1.5596,
"step": 2278
},
{
"epoch": 0.47331256490134993,
"grad_norm": 0.5406957864761353,
"learning_rate": 8e-05,
"loss": 1.5302,
"step": 2279
},
{
"epoch": 0.4735202492211838,
"grad_norm": 0.5365445613861084,
"learning_rate": 8e-05,
"loss": 1.5712,
"step": 2280
},
{
"epoch": 0.47372793354101767,
"grad_norm": 0.4859418272972107,
"learning_rate": 8e-05,
"loss": 1.6174,
"step": 2281
},
{
"epoch": 0.47393561786085153,
"grad_norm": 0.502739429473877,
"learning_rate": 8e-05,
"loss": 1.5976,
"step": 2282
},
{
"epoch": 0.47414330218068534,
"grad_norm": 0.5150384902954102,
"learning_rate": 8e-05,
"loss": 1.5256,
"step": 2283
},
{
"epoch": 0.4743509865005192,
"grad_norm": 0.48385000228881836,
"learning_rate": 8e-05,
"loss": 1.5797,
"step": 2284
},
{
"epoch": 0.4745586708203531,
"grad_norm": 0.5246078372001648,
"learning_rate": 8e-05,
"loss": 1.6164,
"step": 2285
},
{
"epoch": 0.4747663551401869,
"grad_norm": 0.5253199934959412,
"learning_rate": 8e-05,
"loss": 1.5422,
"step": 2286
},
{
"epoch": 0.47497403946002076,
"grad_norm": 0.5063892006874084,
"learning_rate": 8e-05,
"loss": 1.5868,
"step": 2287
},
{
"epoch": 0.4751817237798546,
"grad_norm": 0.5025436878204346,
"learning_rate": 8e-05,
"loss": 1.5405,
"step": 2288
},
{
"epoch": 0.4753894080996885,
"grad_norm": 0.48187509179115295,
"learning_rate": 8e-05,
"loss": 1.4724,
"step": 2289
},
{
"epoch": 0.4755970924195223,
"grad_norm": 0.5353221893310547,
"learning_rate": 8e-05,
"loss": 1.5803,
"step": 2290
},
{
"epoch": 0.47580477673935617,
"grad_norm": 0.5231779217720032,
"learning_rate": 8e-05,
"loss": 1.6353,
"step": 2291
},
{
"epoch": 0.47601246105919004,
"grad_norm": 0.5240789651870728,
"learning_rate": 8e-05,
"loss": 1.5412,
"step": 2292
},
{
"epoch": 0.4762201453790239,
"grad_norm": 0.4947764277458191,
"learning_rate": 8e-05,
"loss": 1.5188,
"step": 2293
},
{
"epoch": 0.4764278296988577,
"grad_norm": 0.4879968464374542,
"learning_rate": 8e-05,
"loss": 1.5719,
"step": 2294
},
{
"epoch": 0.4766355140186916,
"grad_norm": 0.49523064494132996,
"learning_rate": 8e-05,
"loss": 1.5927,
"step": 2295
},
{
"epoch": 0.47684319833852545,
"grad_norm": 0.5027074217796326,
"learning_rate": 8e-05,
"loss": 1.5833,
"step": 2296
},
{
"epoch": 0.4770508826583593,
"grad_norm": 0.5084169507026672,
"learning_rate": 8e-05,
"loss": 1.596,
"step": 2297
},
{
"epoch": 0.47725856697819313,
"grad_norm": 0.51845782995224,
"learning_rate": 8e-05,
"loss": 1.5654,
"step": 2298
},
{
"epoch": 0.477466251298027,
"grad_norm": 0.49563729763031006,
"learning_rate": 8e-05,
"loss": 1.6486,
"step": 2299
},
{
"epoch": 0.47767393561786087,
"grad_norm": 0.510164737701416,
"learning_rate": 8e-05,
"loss": 1.6042,
"step": 2300
},
{
"epoch": 0.4778816199376947,
"grad_norm": 0.5058431029319763,
"learning_rate": 8e-05,
"loss": 1.5206,
"step": 2301
},
{
"epoch": 0.47808930425752855,
"grad_norm": 0.49267512559890747,
"learning_rate": 8e-05,
"loss": 1.5328,
"step": 2302
},
{
"epoch": 0.4782969885773624,
"grad_norm": 0.5097163915634155,
"learning_rate": 8e-05,
"loss": 1.5723,
"step": 2303
},
{
"epoch": 0.4785046728971963,
"grad_norm": 0.5137566328048706,
"learning_rate": 8e-05,
"loss": 1.5595,
"step": 2304
},
{
"epoch": 0.4787123572170301,
"grad_norm": 0.511559009552002,
"learning_rate": 8e-05,
"loss": 1.6153,
"step": 2305
},
{
"epoch": 0.47892004153686396,
"grad_norm": 0.49793705344200134,
"learning_rate": 8e-05,
"loss": 1.5774,
"step": 2306
},
{
"epoch": 0.47912772585669783,
"grad_norm": 0.5060765743255615,
"learning_rate": 8e-05,
"loss": 1.5523,
"step": 2307
},
{
"epoch": 0.4793354101765317,
"grad_norm": 0.49947306513786316,
"learning_rate": 8e-05,
"loss": 1.6676,
"step": 2308
},
{
"epoch": 0.4795430944963655,
"grad_norm": 0.49617889523506165,
"learning_rate": 8e-05,
"loss": 1.5915,
"step": 2309
},
{
"epoch": 0.4797507788161994,
"grad_norm": 0.5186249613761902,
"learning_rate": 8e-05,
"loss": 1.5919,
"step": 2310
},
{
"epoch": 0.47995846313603324,
"grad_norm": 0.5017900466918945,
"learning_rate": 8e-05,
"loss": 1.5352,
"step": 2311
},
{
"epoch": 0.4801661474558671,
"grad_norm": 0.5195544362068176,
"learning_rate": 8e-05,
"loss": 1.5829,
"step": 2312
},
{
"epoch": 0.4803738317757009,
"grad_norm": 0.5048125386238098,
"learning_rate": 8e-05,
"loss": 1.6061,
"step": 2313
},
{
"epoch": 0.4805815160955348,
"grad_norm": 0.5114527940750122,
"learning_rate": 8e-05,
"loss": 1.6474,
"step": 2314
},
{
"epoch": 0.48078920041536866,
"grad_norm": 0.50144362449646,
"learning_rate": 8e-05,
"loss": 1.6303,
"step": 2315
},
{
"epoch": 0.48099688473520247,
"grad_norm": 0.5169421434402466,
"learning_rate": 8e-05,
"loss": 1.5849,
"step": 2316
},
{
"epoch": 0.48120456905503634,
"grad_norm": 0.4925006031990051,
"learning_rate": 8e-05,
"loss": 1.5671,
"step": 2317
},
{
"epoch": 0.4814122533748702,
"grad_norm": 0.5097764134407043,
"learning_rate": 8e-05,
"loss": 1.4887,
"step": 2318
},
{
"epoch": 0.48161993769470407,
"grad_norm": 0.4904961585998535,
"learning_rate": 8e-05,
"loss": 1.5444,
"step": 2319
},
{
"epoch": 0.4818276220145379,
"grad_norm": 0.5130316615104675,
"learning_rate": 8e-05,
"loss": 1.6072,
"step": 2320
},
{
"epoch": 0.48203530633437175,
"grad_norm": 0.4950331747531891,
"learning_rate": 8e-05,
"loss": 1.5635,
"step": 2321
},
{
"epoch": 0.4822429906542056,
"grad_norm": 0.5047598481178284,
"learning_rate": 8e-05,
"loss": 1.5922,
"step": 2322
},
{
"epoch": 0.4824506749740395,
"grad_norm": 0.5059360265731812,
"learning_rate": 8e-05,
"loss": 1.5522,
"step": 2323
},
{
"epoch": 0.4826583592938733,
"grad_norm": 0.5029763579368591,
"learning_rate": 8e-05,
"loss": 1.6158,
"step": 2324
},
{
"epoch": 0.48286604361370716,
"grad_norm": 0.518887460231781,
"learning_rate": 8e-05,
"loss": 1.5916,
"step": 2325
},
{
"epoch": 0.48307372793354103,
"grad_norm": 0.5052504539489746,
"learning_rate": 8e-05,
"loss": 1.5139,
"step": 2326
},
{
"epoch": 0.4832814122533749,
"grad_norm": 0.5410659909248352,
"learning_rate": 8e-05,
"loss": 1.6304,
"step": 2327
},
{
"epoch": 0.4834890965732087,
"grad_norm": 0.5123754739761353,
"learning_rate": 8e-05,
"loss": 1.5425,
"step": 2328
},
{
"epoch": 0.4836967808930426,
"grad_norm": 0.5117024779319763,
"learning_rate": 8e-05,
"loss": 1.5976,
"step": 2329
},
{
"epoch": 0.48390446521287644,
"grad_norm": 0.5014856457710266,
"learning_rate": 8e-05,
"loss": 1.5196,
"step": 2330
},
{
"epoch": 0.48411214953271026,
"grad_norm": 0.5097591280937195,
"learning_rate": 8e-05,
"loss": 1.6369,
"step": 2331
},
{
"epoch": 0.4843198338525441,
"grad_norm": 0.5165501832962036,
"learning_rate": 8e-05,
"loss": 1.6051,
"step": 2332
},
{
"epoch": 0.484527518172378,
"grad_norm": 0.4933048486709595,
"learning_rate": 8e-05,
"loss": 1.5319,
"step": 2333
},
{
"epoch": 0.48473520249221186,
"grad_norm": 0.5414435267448425,
"learning_rate": 8e-05,
"loss": 1.5775,
"step": 2334
},
{
"epoch": 0.48494288681204567,
"grad_norm": 0.5097305178642273,
"learning_rate": 8e-05,
"loss": 1.5904,
"step": 2335
},
{
"epoch": 0.48515057113187954,
"grad_norm": 0.5298729538917542,
"learning_rate": 8e-05,
"loss": 1.5748,
"step": 2336
},
{
"epoch": 0.4853582554517134,
"grad_norm": 0.496655136346817,
"learning_rate": 8e-05,
"loss": 1.5137,
"step": 2337
},
{
"epoch": 0.4855659397715473,
"grad_norm": 0.575095534324646,
"learning_rate": 8e-05,
"loss": 1.5802,
"step": 2338
},
{
"epoch": 0.4857736240913811,
"grad_norm": 0.55838942527771,
"learning_rate": 8e-05,
"loss": 1.6481,
"step": 2339
},
{
"epoch": 0.48598130841121495,
"grad_norm": 0.5484670400619507,
"learning_rate": 8e-05,
"loss": 1.5421,
"step": 2340
},
{
"epoch": 0.4861889927310488,
"grad_norm": 0.49879464507102966,
"learning_rate": 8e-05,
"loss": 1.5479,
"step": 2341
},
{
"epoch": 0.48639667705088263,
"grad_norm": 0.514241099357605,
"learning_rate": 8e-05,
"loss": 1.6005,
"step": 2342
},
{
"epoch": 0.4866043613707165,
"grad_norm": 0.5414352416992188,
"learning_rate": 8e-05,
"loss": 1.5833,
"step": 2343
},
{
"epoch": 0.48681204569055037,
"grad_norm": 0.5201272368431091,
"learning_rate": 8e-05,
"loss": 1.6411,
"step": 2344
},
{
"epoch": 0.48701973001038423,
"grad_norm": 0.5231587290763855,
"learning_rate": 8e-05,
"loss": 1.5735,
"step": 2345
},
{
"epoch": 0.48722741433021804,
"grad_norm": 0.5916883945465088,
"learning_rate": 8e-05,
"loss": 1.654,
"step": 2346
},
{
"epoch": 0.4874350986500519,
"grad_norm": 0.4834756553173065,
"learning_rate": 8e-05,
"loss": 1.5024,
"step": 2347
},
{
"epoch": 0.4876427829698858,
"grad_norm": 0.5379783511161804,
"learning_rate": 8e-05,
"loss": 1.5899,
"step": 2348
},
{
"epoch": 0.48785046728971965,
"grad_norm": 0.5307522416114807,
"learning_rate": 8e-05,
"loss": 1.6059,
"step": 2349
},
{
"epoch": 0.48805815160955346,
"grad_norm": 0.5545095205307007,
"learning_rate": 8e-05,
"loss": 1.6131,
"step": 2350
},
{
"epoch": 0.4882658359293873,
"grad_norm": 0.5184333920478821,
"learning_rate": 8e-05,
"loss": 1.5489,
"step": 2351
},
{
"epoch": 0.4884735202492212,
"grad_norm": 0.5013905763626099,
"learning_rate": 8e-05,
"loss": 1.5485,
"step": 2352
},
{
"epoch": 0.48868120456905506,
"grad_norm": 0.5201506614685059,
"learning_rate": 8e-05,
"loss": 1.5989,
"step": 2353
},
{
"epoch": 0.4888888888888889,
"grad_norm": 0.6300234198570251,
"learning_rate": 8e-05,
"loss": 1.6314,
"step": 2354
},
{
"epoch": 0.48909657320872274,
"grad_norm": 0.5080524682998657,
"learning_rate": 8e-05,
"loss": 1.5786,
"step": 2355
},
{
"epoch": 0.4893042575285566,
"grad_norm": 0.5111988186836243,
"learning_rate": 8e-05,
"loss": 1.5494,
"step": 2356
},
{
"epoch": 0.4895119418483904,
"grad_norm": 0.48312148451805115,
"learning_rate": 8e-05,
"loss": 1.5011,
"step": 2357
},
{
"epoch": 0.4897196261682243,
"grad_norm": 0.5628437995910645,
"learning_rate": 8e-05,
"loss": 1.6562,
"step": 2358
},
{
"epoch": 0.48992731048805815,
"grad_norm": 0.4936966598033905,
"learning_rate": 8e-05,
"loss": 1.5238,
"step": 2359
},
{
"epoch": 0.490134994807892,
"grad_norm": 0.5063262581825256,
"learning_rate": 8e-05,
"loss": 1.5278,
"step": 2360
},
{
"epoch": 0.49034267912772583,
"grad_norm": 0.5016118884086609,
"learning_rate": 8e-05,
"loss": 1.5497,
"step": 2361
},
{
"epoch": 0.4905503634475597,
"grad_norm": 0.5366820096969604,
"learning_rate": 8e-05,
"loss": 1.5932,
"step": 2362
},
{
"epoch": 0.49075804776739357,
"grad_norm": 0.5510308146476746,
"learning_rate": 8e-05,
"loss": 1.5999,
"step": 2363
},
{
"epoch": 0.49096573208722744,
"grad_norm": 0.5173181891441345,
"learning_rate": 8e-05,
"loss": 1.5276,
"step": 2364
},
{
"epoch": 0.49117341640706125,
"grad_norm": 0.5081064105033875,
"learning_rate": 8e-05,
"loss": 1.5833,
"step": 2365
},
{
"epoch": 0.4913811007268951,
"grad_norm": 0.49907955527305603,
"learning_rate": 8e-05,
"loss": 1.5682,
"step": 2366
},
{
"epoch": 0.491588785046729,
"grad_norm": 0.501561164855957,
"learning_rate": 8e-05,
"loss": 1.6015,
"step": 2367
},
{
"epoch": 0.49179646936656285,
"grad_norm": 0.5394383668899536,
"learning_rate": 8e-05,
"loss": 1.5138,
"step": 2368
},
{
"epoch": 0.49200415368639666,
"grad_norm": 0.5115194320678711,
"learning_rate": 8e-05,
"loss": 1.5979,
"step": 2369
},
{
"epoch": 0.49221183800623053,
"grad_norm": 0.5246303081512451,
"learning_rate": 8e-05,
"loss": 1.6148,
"step": 2370
},
{
"epoch": 0.4924195223260644,
"grad_norm": 0.491183340549469,
"learning_rate": 8e-05,
"loss": 1.5987,
"step": 2371
},
{
"epoch": 0.4926272066458982,
"grad_norm": 0.5178416967391968,
"learning_rate": 8e-05,
"loss": 1.6278,
"step": 2372
},
{
"epoch": 0.4928348909657321,
"grad_norm": 0.5169252753257751,
"learning_rate": 8e-05,
"loss": 1.5663,
"step": 2373
},
{
"epoch": 0.49304257528556594,
"grad_norm": 0.48223575949668884,
"learning_rate": 8e-05,
"loss": 1.5656,
"step": 2374
},
{
"epoch": 0.4932502596053998,
"grad_norm": 0.5244314670562744,
"learning_rate": 8e-05,
"loss": 1.5936,
"step": 2375
},
{
"epoch": 0.4934579439252336,
"grad_norm": 0.5011839866638184,
"learning_rate": 8e-05,
"loss": 1.5836,
"step": 2376
},
{
"epoch": 0.4936656282450675,
"grad_norm": 0.5012759566307068,
"learning_rate": 8e-05,
"loss": 1.4722,
"step": 2377
},
{
"epoch": 0.49387331256490136,
"grad_norm": 0.5099151134490967,
"learning_rate": 8e-05,
"loss": 1.5995,
"step": 2378
},
{
"epoch": 0.4940809968847352,
"grad_norm": 0.5069243311882019,
"learning_rate": 8e-05,
"loss": 1.5511,
"step": 2379
},
{
"epoch": 0.49428868120456904,
"grad_norm": 0.48993173241615295,
"learning_rate": 8e-05,
"loss": 1.5201,
"step": 2380
},
{
"epoch": 0.4944963655244029,
"grad_norm": 0.5133856534957886,
"learning_rate": 8e-05,
"loss": 1.6163,
"step": 2381
},
{
"epoch": 0.49470404984423677,
"grad_norm": 0.48842230439186096,
"learning_rate": 8e-05,
"loss": 1.4761,
"step": 2382
},
{
"epoch": 0.49491173416407064,
"grad_norm": 0.5133230090141296,
"learning_rate": 8e-05,
"loss": 1.607,
"step": 2383
},
{
"epoch": 0.49511941848390445,
"grad_norm": 0.5300762057304382,
"learning_rate": 8e-05,
"loss": 1.6077,
"step": 2384
},
{
"epoch": 0.4953271028037383,
"grad_norm": 0.5309485793113708,
"learning_rate": 8e-05,
"loss": 1.6195,
"step": 2385
},
{
"epoch": 0.4955347871235722,
"grad_norm": 0.5198656320571899,
"learning_rate": 8e-05,
"loss": 1.586,
"step": 2386
},
{
"epoch": 0.495742471443406,
"grad_norm": 0.5134472846984863,
"learning_rate": 8e-05,
"loss": 1.5686,
"step": 2387
},
{
"epoch": 0.49595015576323986,
"grad_norm": 0.5401621460914612,
"learning_rate": 8e-05,
"loss": 1.5476,
"step": 2388
},
{
"epoch": 0.49615784008307373,
"grad_norm": 0.5133628249168396,
"learning_rate": 8e-05,
"loss": 1.5707,
"step": 2389
},
{
"epoch": 0.4963655244029076,
"grad_norm": 0.5219677686691284,
"learning_rate": 8e-05,
"loss": 1.5748,
"step": 2390
},
{
"epoch": 0.4965732087227414,
"grad_norm": 0.5132991671562195,
"learning_rate": 8e-05,
"loss": 1.5675,
"step": 2391
},
{
"epoch": 0.4967808930425753,
"grad_norm": 0.5127162933349609,
"learning_rate": 8e-05,
"loss": 1.5678,
"step": 2392
},
{
"epoch": 0.49698857736240915,
"grad_norm": 0.5062135457992554,
"learning_rate": 8e-05,
"loss": 1.5377,
"step": 2393
},
{
"epoch": 0.497196261682243,
"grad_norm": 0.5313796997070312,
"learning_rate": 8e-05,
"loss": 1.6169,
"step": 2394
},
{
"epoch": 0.4974039460020768,
"grad_norm": 0.5403634309768677,
"learning_rate": 8e-05,
"loss": 1.5679,
"step": 2395
},
{
"epoch": 0.4976116303219107,
"grad_norm": 0.5055514574050903,
"learning_rate": 8e-05,
"loss": 1.5609,
"step": 2396
},
{
"epoch": 0.49781931464174456,
"grad_norm": 0.4901556372642517,
"learning_rate": 8e-05,
"loss": 1.5681,
"step": 2397
},
{
"epoch": 0.4980269989615784,
"grad_norm": 0.5094218254089355,
"learning_rate": 8e-05,
"loss": 1.5921,
"step": 2398
},
{
"epoch": 0.49823468328141224,
"grad_norm": 0.5202116370201111,
"learning_rate": 8e-05,
"loss": 1.604,
"step": 2399
},
{
"epoch": 0.4984423676012461,
"grad_norm": 0.4881117343902588,
"learning_rate": 8e-05,
"loss": 1.4603,
"step": 2400
},
{
"epoch": 0.49865005192108,
"grad_norm": 0.5098052620887756,
"learning_rate": 8e-05,
"loss": 1.525,
"step": 2401
},
{
"epoch": 0.4988577362409138,
"grad_norm": 0.5244508385658264,
"learning_rate": 8e-05,
"loss": 1.632,
"step": 2402
},
{
"epoch": 0.49906542056074765,
"grad_norm": 0.5339807271957397,
"learning_rate": 8e-05,
"loss": 1.7082,
"step": 2403
},
{
"epoch": 0.4992731048805815,
"grad_norm": 0.5100862979888916,
"learning_rate": 8e-05,
"loss": 1.5351,
"step": 2404
},
{
"epoch": 0.4994807892004154,
"grad_norm": 0.5070978999137878,
"learning_rate": 8e-05,
"loss": 1.5406,
"step": 2405
},
{
"epoch": 0.4996884735202492,
"grad_norm": 0.514641523361206,
"learning_rate": 8e-05,
"loss": 1.6159,
"step": 2406
},
{
"epoch": 0.49989615784008307,
"grad_norm": 0.5069220662117004,
"learning_rate": 8e-05,
"loss": 1.6052,
"step": 2407
},
{
"epoch": 0.5001038421599169,
"grad_norm": 0.526537299156189,
"learning_rate": 8e-05,
"loss": 1.5635,
"step": 2408
},
{
"epoch": 0.5003115264797507,
"grad_norm": 0.5100792050361633,
"learning_rate": 8e-05,
"loss": 1.5645,
"step": 2409
},
{
"epoch": 0.5005192107995846,
"grad_norm": 0.4965640902519226,
"learning_rate": 8e-05,
"loss": 1.4884,
"step": 2410
},
{
"epoch": 0.5007268951194185,
"grad_norm": 0.5005423426628113,
"learning_rate": 8e-05,
"loss": 1.5408,
"step": 2411
},
{
"epoch": 0.5009345794392523,
"grad_norm": 0.5040135979652405,
"learning_rate": 8e-05,
"loss": 1.5669,
"step": 2412
},
{
"epoch": 0.5011422637590862,
"grad_norm": 0.4898592233657837,
"learning_rate": 8e-05,
"loss": 1.505,
"step": 2413
},
{
"epoch": 0.5013499480789201,
"grad_norm": 0.5353949666023254,
"learning_rate": 8e-05,
"loss": 1.5671,
"step": 2414
},
{
"epoch": 0.5015576323987538,
"grad_norm": 0.5377967953681946,
"learning_rate": 8e-05,
"loss": 1.6208,
"step": 2415
},
{
"epoch": 0.5017653167185877,
"grad_norm": 0.5099054574966431,
"learning_rate": 8e-05,
"loss": 1.5954,
"step": 2416
},
{
"epoch": 0.5019730010384216,
"grad_norm": 0.5304335355758667,
"learning_rate": 8e-05,
"loss": 1.5279,
"step": 2417
},
{
"epoch": 0.5021806853582554,
"grad_norm": 0.5465752482414246,
"learning_rate": 8e-05,
"loss": 1.595,
"step": 2418
},
{
"epoch": 0.5023883696780893,
"grad_norm": 0.49918806552886963,
"learning_rate": 8e-05,
"loss": 1.5669,
"step": 2419
},
{
"epoch": 0.5025960539979232,
"grad_norm": 0.5252723693847656,
"learning_rate": 8e-05,
"loss": 1.527,
"step": 2420
},
{
"epoch": 0.502803738317757,
"grad_norm": 0.5622281432151794,
"learning_rate": 8e-05,
"loss": 1.6202,
"step": 2421
},
{
"epoch": 0.5030114226375909,
"grad_norm": 0.5915151834487915,
"learning_rate": 8e-05,
"loss": 1.6003,
"step": 2422
},
{
"epoch": 0.5032191069574247,
"grad_norm": 0.5138158202171326,
"learning_rate": 8e-05,
"loss": 1.5566,
"step": 2423
},
{
"epoch": 0.5034267912772585,
"grad_norm": 0.5394647121429443,
"learning_rate": 8e-05,
"loss": 1.5688,
"step": 2424
},
{
"epoch": 0.5036344755970924,
"grad_norm": 0.5186442732810974,
"learning_rate": 8e-05,
"loss": 1.5793,
"step": 2425
},
{
"epoch": 0.5038421599169263,
"grad_norm": 0.5293716192245483,
"learning_rate": 8e-05,
"loss": 1.5513,
"step": 2426
},
{
"epoch": 0.5040498442367601,
"grad_norm": 0.524944007396698,
"learning_rate": 8e-05,
"loss": 1.5179,
"step": 2427
},
{
"epoch": 0.504257528556594,
"grad_norm": 0.5945055484771729,
"learning_rate": 8e-05,
"loss": 1.5778,
"step": 2428
},
{
"epoch": 0.5044652128764279,
"grad_norm": 0.5441060662269592,
"learning_rate": 8e-05,
"loss": 1.5757,
"step": 2429
},
{
"epoch": 0.5046728971962616,
"grad_norm": 0.5250440239906311,
"learning_rate": 8e-05,
"loss": 1.5304,
"step": 2430
},
{
"epoch": 0.5048805815160955,
"grad_norm": 0.5178036689758301,
"learning_rate": 8e-05,
"loss": 1.5371,
"step": 2431
},
{
"epoch": 0.5050882658359294,
"grad_norm": 0.5340495705604553,
"learning_rate": 8e-05,
"loss": 1.5492,
"step": 2432
},
{
"epoch": 0.5052959501557632,
"grad_norm": 0.5054112076759338,
"learning_rate": 8e-05,
"loss": 1.5761,
"step": 2433
},
{
"epoch": 0.5055036344755971,
"grad_norm": 0.5301451086997986,
"learning_rate": 8e-05,
"loss": 1.599,
"step": 2434
},
{
"epoch": 0.505711318795431,
"grad_norm": 0.5750793814659119,
"learning_rate": 8e-05,
"loss": 1.6637,
"step": 2435
},
{
"epoch": 0.5059190031152648,
"grad_norm": 0.5496008992195129,
"learning_rate": 8e-05,
"loss": 1.5936,
"step": 2436
},
{
"epoch": 0.5061266874350987,
"grad_norm": 0.5835946202278137,
"learning_rate": 8e-05,
"loss": 1.5705,
"step": 2437
},
{
"epoch": 0.5063343717549325,
"grad_norm": 0.5562037825584412,
"learning_rate": 8e-05,
"loss": 1.5426,
"step": 2438
},
{
"epoch": 0.5065420560747663,
"grad_norm": 0.49563848972320557,
"learning_rate": 8e-05,
"loss": 1.563,
"step": 2439
},
{
"epoch": 0.5067497403946002,
"grad_norm": 0.533659815788269,
"learning_rate": 8e-05,
"loss": 1.5853,
"step": 2440
},
{
"epoch": 0.5069574247144341,
"grad_norm": 0.5142748355865479,
"learning_rate": 8e-05,
"loss": 1.4993,
"step": 2441
},
{
"epoch": 0.5071651090342679,
"grad_norm": 0.5077499747276306,
"learning_rate": 8e-05,
"loss": 1.5803,
"step": 2442
},
{
"epoch": 0.5073727933541018,
"grad_norm": 0.5426058769226074,
"learning_rate": 8e-05,
"loss": 1.59,
"step": 2443
},
{
"epoch": 0.5075804776739357,
"grad_norm": 0.4893079996109009,
"learning_rate": 8e-05,
"loss": 1.4893,
"step": 2444
},
{
"epoch": 0.5077881619937694,
"grad_norm": 0.5184099078178406,
"learning_rate": 8e-05,
"loss": 1.5245,
"step": 2445
},
{
"epoch": 0.5079958463136033,
"grad_norm": 0.5037515759468079,
"learning_rate": 8e-05,
"loss": 1.482,
"step": 2446
},
{
"epoch": 0.5082035306334372,
"grad_norm": 0.5105049014091492,
"learning_rate": 8e-05,
"loss": 1.5913,
"step": 2447
},
{
"epoch": 0.508411214953271,
"grad_norm": 0.513841450214386,
"learning_rate": 8e-05,
"loss": 1.5031,
"step": 2448
},
{
"epoch": 0.5086188992731049,
"grad_norm": 0.5362356305122375,
"learning_rate": 8e-05,
"loss": 1.5876,
"step": 2449
},
{
"epoch": 0.5088265835929388,
"grad_norm": 0.5155436396598816,
"learning_rate": 8e-05,
"loss": 1.5723,
"step": 2450
},
{
"epoch": 0.5090342679127726,
"grad_norm": 0.5344365239143372,
"learning_rate": 8e-05,
"loss": 1.5596,
"step": 2451
},
{
"epoch": 0.5092419522326065,
"grad_norm": 0.5077399611473083,
"learning_rate": 8e-05,
"loss": 1.6037,
"step": 2452
},
{
"epoch": 0.5094496365524402,
"grad_norm": 0.5176429748535156,
"learning_rate": 8e-05,
"loss": 1.5707,
"step": 2453
},
{
"epoch": 0.5096573208722741,
"grad_norm": 0.5297008156776428,
"learning_rate": 8e-05,
"loss": 1.5247,
"step": 2454
},
{
"epoch": 0.509865005192108,
"grad_norm": 0.5335763096809387,
"learning_rate": 8e-05,
"loss": 1.5811,
"step": 2455
},
{
"epoch": 0.5100726895119418,
"grad_norm": 0.5081517100334167,
"learning_rate": 8e-05,
"loss": 1.5559,
"step": 2456
},
{
"epoch": 0.5102803738317757,
"grad_norm": 0.5149271488189697,
"learning_rate": 8e-05,
"loss": 1.5795,
"step": 2457
},
{
"epoch": 0.5104880581516096,
"grad_norm": 0.49932461977005005,
"learning_rate": 8e-05,
"loss": 1.5799,
"step": 2458
},
{
"epoch": 0.5106957424714434,
"grad_norm": 0.5233448147773743,
"learning_rate": 8e-05,
"loss": 1.5389,
"step": 2459
},
{
"epoch": 0.5109034267912772,
"grad_norm": 0.53509521484375,
"learning_rate": 8e-05,
"loss": 1.6398,
"step": 2460
},
{
"epoch": 0.5111111111111111,
"grad_norm": 0.551110565662384,
"learning_rate": 8e-05,
"loss": 1.535,
"step": 2461
},
{
"epoch": 0.5113187954309449,
"grad_norm": 0.5153088569641113,
"learning_rate": 8e-05,
"loss": 1.5462,
"step": 2462
},
{
"epoch": 0.5115264797507788,
"grad_norm": 0.5195600390434265,
"learning_rate": 8e-05,
"loss": 1.5478,
"step": 2463
},
{
"epoch": 0.5117341640706127,
"grad_norm": 0.5105565786361694,
"learning_rate": 8e-05,
"loss": 1.5332,
"step": 2464
},
{
"epoch": 0.5119418483904465,
"grad_norm": 0.5178881287574768,
"learning_rate": 8e-05,
"loss": 1.5375,
"step": 2465
},
{
"epoch": 0.5121495327102804,
"grad_norm": 0.5123568773269653,
"learning_rate": 8e-05,
"loss": 1.6039,
"step": 2466
},
{
"epoch": 0.5123572170301143,
"grad_norm": 0.512981653213501,
"learning_rate": 8e-05,
"loss": 1.5822,
"step": 2467
},
{
"epoch": 0.512564901349948,
"grad_norm": 0.5098233819007874,
"learning_rate": 8e-05,
"loss": 1.6099,
"step": 2468
},
{
"epoch": 0.5127725856697819,
"grad_norm": 0.5267816781997681,
"learning_rate": 8e-05,
"loss": 1.5593,
"step": 2469
},
{
"epoch": 0.5129802699896158,
"grad_norm": 0.517514705657959,
"learning_rate": 8e-05,
"loss": 1.5874,
"step": 2470
},
{
"epoch": 0.5131879543094496,
"grad_norm": 0.49412310123443604,
"learning_rate": 8e-05,
"loss": 1.5829,
"step": 2471
},
{
"epoch": 0.5133956386292835,
"grad_norm": 0.49666720628738403,
"learning_rate": 8e-05,
"loss": 1.5223,
"step": 2472
},
{
"epoch": 0.5136033229491174,
"grad_norm": 0.511103630065918,
"learning_rate": 8e-05,
"loss": 1.5971,
"step": 2473
},
{
"epoch": 0.5138110072689512,
"grad_norm": 0.49603182077407837,
"learning_rate": 8e-05,
"loss": 1.6282,
"step": 2474
},
{
"epoch": 0.514018691588785,
"grad_norm": 0.4902198016643524,
"learning_rate": 8e-05,
"loss": 1.6366,
"step": 2475
},
{
"epoch": 0.5142263759086189,
"grad_norm": 0.5027957558631897,
"learning_rate": 8e-05,
"loss": 1.5777,
"step": 2476
},
{
"epoch": 0.5144340602284527,
"grad_norm": 0.5001282095909119,
"learning_rate": 8e-05,
"loss": 1.4579,
"step": 2477
},
{
"epoch": 0.5146417445482866,
"grad_norm": 0.49658671021461487,
"learning_rate": 8e-05,
"loss": 1.5798,
"step": 2478
},
{
"epoch": 0.5148494288681205,
"grad_norm": 0.5023085474967957,
"learning_rate": 8e-05,
"loss": 1.544,
"step": 2479
},
{
"epoch": 0.5150571131879543,
"grad_norm": 0.5538403391838074,
"learning_rate": 8e-05,
"loss": 1.5892,
"step": 2480
},
{
"epoch": 0.5152647975077882,
"grad_norm": 0.5351791381835938,
"learning_rate": 8e-05,
"loss": 1.6152,
"step": 2481
},
{
"epoch": 0.5154724818276221,
"grad_norm": 0.5126028060913086,
"learning_rate": 8e-05,
"loss": 1.5706,
"step": 2482
},
{
"epoch": 0.5156801661474558,
"grad_norm": 0.509198009967804,
"learning_rate": 8e-05,
"loss": 1.5492,
"step": 2483
},
{
"epoch": 0.5158878504672897,
"grad_norm": 0.5237105488777161,
"learning_rate": 8e-05,
"loss": 1.5066,
"step": 2484
},
{
"epoch": 0.5160955347871236,
"grad_norm": 0.49990713596343994,
"learning_rate": 8e-05,
"loss": 1.5941,
"step": 2485
},
{
"epoch": 0.5163032191069574,
"grad_norm": 0.5106729865074158,
"learning_rate": 8e-05,
"loss": 1.5571,
"step": 2486
},
{
"epoch": 0.5165109034267913,
"grad_norm": 0.5249480605125427,
"learning_rate": 8e-05,
"loss": 1.5523,
"step": 2487
},
{
"epoch": 0.5167185877466252,
"grad_norm": 0.49067258834838867,
"learning_rate": 8e-05,
"loss": 1.6248,
"step": 2488
},
{
"epoch": 0.516926272066459,
"grad_norm": 0.5179197192192078,
"learning_rate": 8e-05,
"loss": 1.5817,
"step": 2489
},
{
"epoch": 0.5171339563862928,
"grad_norm": 0.5394155979156494,
"learning_rate": 8e-05,
"loss": 1.5518,
"step": 2490
},
{
"epoch": 0.5173416407061266,
"grad_norm": 0.49925529956817627,
"learning_rate": 8e-05,
"loss": 1.5646,
"step": 2491
},
{
"epoch": 0.5175493250259605,
"grad_norm": 0.5597730278968811,
"learning_rate": 8e-05,
"loss": 1.6009,
"step": 2492
},
{
"epoch": 0.5177570093457944,
"grad_norm": 0.5028907060623169,
"learning_rate": 8e-05,
"loss": 1.55,
"step": 2493
},
{
"epoch": 0.5179646936656283,
"grad_norm": 0.5173121690750122,
"learning_rate": 8e-05,
"loss": 1.5156,
"step": 2494
},
{
"epoch": 0.5181723779854621,
"grad_norm": 0.5130355954170227,
"learning_rate": 8e-05,
"loss": 1.6179,
"step": 2495
},
{
"epoch": 0.518380062305296,
"grad_norm": 0.492567241191864,
"learning_rate": 8e-05,
"loss": 1.5069,
"step": 2496
},
{
"epoch": 0.5185877466251299,
"grad_norm": 0.5323511958122253,
"learning_rate": 8e-05,
"loss": 1.5661,
"step": 2497
},
{
"epoch": 0.5187954309449636,
"grad_norm": 0.5106134414672852,
"learning_rate": 8e-05,
"loss": 1.5629,
"step": 2498
},
{
"epoch": 0.5190031152647975,
"grad_norm": 0.5243561863899231,
"learning_rate": 8e-05,
"loss": 1.5809,
"step": 2499
},
{
"epoch": 0.5192107995846313,
"grad_norm": 0.5350177884101868,
"learning_rate": 8e-05,
"loss": 1.5558,
"step": 2500
},
{
"epoch": 0.5194184839044652,
"grad_norm": 0.4913942217826843,
"learning_rate": 8e-05,
"loss": 1.5466,
"step": 2501
},
{
"epoch": 0.5196261682242991,
"grad_norm": 0.5095742344856262,
"learning_rate": 8e-05,
"loss": 1.5512,
"step": 2502
},
{
"epoch": 0.519833852544133,
"grad_norm": 0.5109645128250122,
"learning_rate": 8e-05,
"loss": 1.5481,
"step": 2503
},
{
"epoch": 0.5200415368639668,
"grad_norm": 0.5052945017814636,
"learning_rate": 8e-05,
"loss": 1.5722,
"step": 2504
},
{
"epoch": 0.5202492211838006,
"grad_norm": 0.5314079523086548,
"learning_rate": 8e-05,
"loss": 1.5591,
"step": 2505
},
{
"epoch": 0.5204569055036344,
"grad_norm": 0.5043820738792419,
"learning_rate": 8e-05,
"loss": 1.5327,
"step": 2506
},
{
"epoch": 0.5206645898234683,
"grad_norm": 0.5031879544258118,
"learning_rate": 8e-05,
"loss": 1.4327,
"step": 2507
},
{
"epoch": 0.5208722741433022,
"grad_norm": 0.5129706859588623,
"learning_rate": 8e-05,
"loss": 1.5805,
"step": 2508
},
{
"epoch": 0.521079958463136,
"grad_norm": 0.5188937187194824,
"learning_rate": 8e-05,
"loss": 1.5256,
"step": 2509
},
{
"epoch": 0.5212876427829699,
"grad_norm": 0.5003663897514343,
"learning_rate": 8e-05,
"loss": 1.536,
"step": 2510
},
{
"epoch": 0.5214953271028038,
"grad_norm": 0.5461739897727966,
"learning_rate": 8e-05,
"loss": 1.5779,
"step": 2511
},
{
"epoch": 0.5217030114226376,
"grad_norm": 0.5200935006141663,
"learning_rate": 8e-05,
"loss": 1.6121,
"step": 2512
},
{
"epoch": 0.5219106957424714,
"grad_norm": 0.5046533942222595,
"learning_rate": 8e-05,
"loss": 1.5157,
"step": 2513
},
{
"epoch": 0.5221183800623053,
"grad_norm": 0.5248345136642456,
"learning_rate": 8e-05,
"loss": 1.6406,
"step": 2514
},
{
"epoch": 0.5223260643821391,
"grad_norm": 0.5107331871986389,
"learning_rate": 8e-05,
"loss": 1.5285,
"step": 2515
},
{
"epoch": 0.522533748701973,
"grad_norm": 0.532707691192627,
"learning_rate": 8e-05,
"loss": 1.5672,
"step": 2516
},
{
"epoch": 0.5227414330218069,
"grad_norm": 0.5015535354614258,
"learning_rate": 8e-05,
"loss": 1.6222,
"step": 2517
},
{
"epoch": 0.5229491173416407,
"grad_norm": 0.49384546279907227,
"learning_rate": 8e-05,
"loss": 1.5419,
"step": 2518
},
{
"epoch": 0.5231568016614746,
"grad_norm": 0.5301195383071899,
"learning_rate": 8e-05,
"loss": 1.5981,
"step": 2519
},
{
"epoch": 0.5233644859813084,
"grad_norm": 0.5169888138771057,
"learning_rate": 8e-05,
"loss": 1.5472,
"step": 2520
},
{
"epoch": 0.5235721703011422,
"grad_norm": 0.5029063820838928,
"learning_rate": 8e-05,
"loss": 1.588,
"step": 2521
},
{
"epoch": 0.5237798546209761,
"grad_norm": 0.5058553814888,
"learning_rate": 8e-05,
"loss": 1.5353,
"step": 2522
},
{
"epoch": 0.52398753894081,
"grad_norm": 0.5162726640701294,
"learning_rate": 8e-05,
"loss": 1.5503,
"step": 2523
},
{
"epoch": 0.5241952232606438,
"grad_norm": 0.526066780090332,
"learning_rate": 8e-05,
"loss": 1.5831,
"step": 2524
},
{
"epoch": 0.5244029075804777,
"grad_norm": 0.5261390805244446,
"learning_rate": 8e-05,
"loss": 1.5507,
"step": 2525
},
{
"epoch": 0.5246105919003116,
"grad_norm": 0.515093207359314,
"learning_rate": 8e-05,
"loss": 1.539,
"step": 2526
},
{
"epoch": 0.5248182762201454,
"grad_norm": 0.5058615803718567,
"learning_rate": 8e-05,
"loss": 1.6147,
"step": 2527
},
{
"epoch": 0.5250259605399792,
"grad_norm": 0.5275096297264099,
"learning_rate": 8e-05,
"loss": 1.5747,
"step": 2528
},
{
"epoch": 0.525233644859813,
"grad_norm": 0.521035373210907,
"learning_rate": 8e-05,
"loss": 1.531,
"step": 2529
},
{
"epoch": 0.5254413291796469,
"grad_norm": 0.525314211845398,
"learning_rate": 8e-05,
"loss": 1.5806,
"step": 2530
},
{
"epoch": 0.5256490134994808,
"grad_norm": 0.5049483180046082,
"learning_rate": 8e-05,
"loss": 1.5496,
"step": 2531
},
{
"epoch": 0.5258566978193147,
"grad_norm": 0.5101779103279114,
"learning_rate": 8e-05,
"loss": 1.5434,
"step": 2532
},
{
"epoch": 0.5260643821391485,
"grad_norm": 0.5049511194229126,
"learning_rate": 8e-05,
"loss": 1.5017,
"step": 2533
},
{
"epoch": 0.5262720664589824,
"grad_norm": 0.5197418332099915,
"learning_rate": 8e-05,
"loss": 1.5678,
"step": 2534
},
{
"epoch": 0.5264797507788161,
"grad_norm": 0.5398335456848145,
"learning_rate": 8e-05,
"loss": 1.6633,
"step": 2535
},
{
"epoch": 0.52668743509865,
"grad_norm": 0.506309986114502,
"learning_rate": 8e-05,
"loss": 1.4991,
"step": 2536
},
{
"epoch": 0.5268951194184839,
"grad_norm": 0.515005886554718,
"learning_rate": 8e-05,
"loss": 1.5842,
"step": 2537
},
{
"epoch": 0.5271028037383177,
"grad_norm": 0.49161919951438904,
"learning_rate": 8e-05,
"loss": 1.5945,
"step": 2538
},
{
"epoch": 0.5273104880581516,
"grad_norm": 0.4888719320297241,
"learning_rate": 8e-05,
"loss": 1.5212,
"step": 2539
},
{
"epoch": 0.5275181723779855,
"grad_norm": 0.4999425709247589,
"learning_rate": 8e-05,
"loss": 1.5424,
"step": 2540
},
{
"epoch": 0.5277258566978194,
"grad_norm": 0.5267025232315063,
"learning_rate": 8e-05,
"loss": 1.5701,
"step": 2541
},
{
"epoch": 0.5279335410176532,
"grad_norm": 0.5228183269500732,
"learning_rate": 8e-05,
"loss": 1.5789,
"step": 2542
},
{
"epoch": 0.528141225337487,
"grad_norm": 0.5197039842605591,
"learning_rate": 8e-05,
"loss": 1.592,
"step": 2543
},
{
"epoch": 0.5283489096573208,
"grad_norm": 0.5103591084480286,
"learning_rate": 8e-05,
"loss": 1.5852,
"step": 2544
},
{
"epoch": 0.5285565939771547,
"grad_norm": 0.4849054515361786,
"learning_rate": 8e-05,
"loss": 1.5137,
"step": 2545
},
{
"epoch": 0.5287642782969886,
"grad_norm": 0.5198196172714233,
"learning_rate": 8e-05,
"loss": 1.5192,
"step": 2546
},
{
"epoch": 0.5289719626168224,
"grad_norm": 0.49438804388046265,
"learning_rate": 8e-05,
"loss": 1.5662,
"step": 2547
},
{
"epoch": 0.5291796469366563,
"grad_norm": 0.5067175626754761,
"learning_rate": 8e-05,
"loss": 1.5503,
"step": 2548
},
{
"epoch": 0.5293873312564902,
"grad_norm": 0.5209910273551941,
"learning_rate": 8e-05,
"loss": 1.5551,
"step": 2549
},
{
"epoch": 0.5295950155763239,
"grad_norm": 0.5711840391159058,
"learning_rate": 8e-05,
"loss": 1.6114,
"step": 2550
},
{
"epoch": 0.5298026998961578,
"grad_norm": 0.5297765135765076,
"learning_rate": 8e-05,
"loss": 1.4573,
"step": 2551
},
{
"epoch": 0.5300103842159917,
"grad_norm": 0.5275728106498718,
"learning_rate": 8e-05,
"loss": 1.5492,
"step": 2552
},
{
"epoch": 0.5302180685358255,
"grad_norm": 0.513297438621521,
"learning_rate": 8e-05,
"loss": 1.5549,
"step": 2553
},
{
"epoch": 0.5304257528556594,
"grad_norm": 0.5029946565628052,
"learning_rate": 8e-05,
"loss": 1.5527,
"step": 2554
},
{
"epoch": 0.5306334371754933,
"grad_norm": 0.56758713722229,
"learning_rate": 8e-05,
"loss": 1.6427,
"step": 2555
},
{
"epoch": 0.5308411214953271,
"grad_norm": 0.5123136639595032,
"learning_rate": 8e-05,
"loss": 1.561,
"step": 2556
},
{
"epoch": 0.531048805815161,
"grad_norm": 0.5428341627120972,
"learning_rate": 8e-05,
"loss": 1.6549,
"step": 2557
},
{
"epoch": 0.5312564901349948,
"grad_norm": 0.5145912766456604,
"learning_rate": 8e-05,
"loss": 1.6259,
"step": 2558
},
{
"epoch": 0.5314641744548286,
"grad_norm": 0.5414952635765076,
"learning_rate": 8e-05,
"loss": 1.5668,
"step": 2559
},
{
"epoch": 0.5316718587746625,
"grad_norm": 0.5508964657783508,
"learning_rate": 8e-05,
"loss": 1.4993,
"step": 2560
},
{
"epoch": 0.5318795430944964,
"grad_norm": 0.501332700252533,
"learning_rate": 8e-05,
"loss": 1.5632,
"step": 2561
},
{
"epoch": 0.5320872274143302,
"grad_norm": 0.4879615902900696,
"learning_rate": 8e-05,
"loss": 1.4787,
"step": 2562
},
{
"epoch": 0.5322949117341641,
"grad_norm": 0.5425028800964355,
"learning_rate": 8e-05,
"loss": 1.5681,
"step": 2563
},
{
"epoch": 0.532502596053998,
"grad_norm": 0.5095462203025818,
"learning_rate": 8e-05,
"loss": 1.543,
"step": 2564
},
{
"epoch": 0.5327102803738317,
"grad_norm": 0.5352701544761658,
"learning_rate": 8e-05,
"loss": 1.6086,
"step": 2565
},
{
"epoch": 0.5329179646936656,
"grad_norm": 0.4908084571361542,
"learning_rate": 8e-05,
"loss": 1.5646,
"step": 2566
},
{
"epoch": 0.5331256490134995,
"grad_norm": 0.551490306854248,
"learning_rate": 8e-05,
"loss": 1.5477,
"step": 2567
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.5218336582183838,
"learning_rate": 8e-05,
"loss": 1.5666,
"step": 2568
},
{
"epoch": 0.5335410176531672,
"grad_norm": 0.5142163634300232,
"learning_rate": 8e-05,
"loss": 1.6095,
"step": 2569
},
{
"epoch": 0.5337487019730011,
"grad_norm": 0.5136362910270691,
"learning_rate": 8e-05,
"loss": 1.5997,
"step": 2570
},
{
"epoch": 0.5339563862928349,
"grad_norm": 0.539950966835022,
"learning_rate": 8e-05,
"loss": 1.6219,
"step": 2571
},
{
"epoch": 0.5341640706126688,
"grad_norm": 0.5029425621032715,
"learning_rate": 8e-05,
"loss": 1.5646,
"step": 2572
},
{
"epoch": 0.5343717549325026,
"grad_norm": 0.562980055809021,
"learning_rate": 8e-05,
"loss": 1.5901,
"step": 2573
},
{
"epoch": 0.5345794392523364,
"grad_norm": 0.5117045044898987,
"learning_rate": 8e-05,
"loss": 1.5533,
"step": 2574
},
{
"epoch": 0.5347871235721703,
"grad_norm": 0.5418522953987122,
"learning_rate": 8e-05,
"loss": 1.5846,
"step": 2575
},
{
"epoch": 0.5349948078920042,
"grad_norm": 0.5144622921943665,
"learning_rate": 8e-05,
"loss": 1.6386,
"step": 2576
},
{
"epoch": 0.535202492211838,
"grad_norm": 0.5126820206642151,
"learning_rate": 8e-05,
"loss": 1.524,
"step": 2577
},
{
"epoch": 0.5354101765316719,
"grad_norm": 0.5278242826461792,
"learning_rate": 8e-05,
"loss": 1.6653,
"step": 2578
},
{
"epoch": 0.5356178608515058,
"grad_norm": 0.5213025212287903,
"learning_rate": 8e-05,
"loss": 1.5981,
"step": 2579
},
{
"epoch": 0.5358255451713395,
"grad_norm": 0.5076565146446228,
"learning_rate": 8e-05,
"loss": 1.5918,
"step": 2580
},
{
"epoch": 0.5360332294911734,
"grad_norm": 0.5242885947227478,
"learning_rate": 8e-05,
"loss": 1.5806,
"step": 2581
},
{
"epoch": 0.5362409138110072,
"grad_norm": 0.5121769905090332,
"learning_rate": 8e-05,
"loss": 1.4982,
"step": 2582
},
{
"epoch": 0.5364485981308411,
"grad_norm": 0.5083255767822266,
"learning_rate": 8e-05,
"loss": 1.5928,
"step": 2583
},
{
"epoch": 0.536656282450675,
"grad_norm": 0.4839821755886078,
"learning_rate": 8e-05,
"loss": 1.5292,
"step": 2584
},
{
"epoch": 0.5368639667705088,
"grad_norm": 0.5013673901557922,
"learning_rate": 8e-05,
"loss": 1.5556,
"step": 2585
},
{
"epoch": 0.5370716510903427,
"grad_norm": 0.5166984796524048,
"learning_rate": 8e-05,
"loss": 1.5705,
"step": 2586
},
{
"epoch": 0.5372793354101766,
"grad_norm": 0.5307886004447937,
"learning_rate": 8e-05,
"loss": 1.5807,
"step": 2587
},
{
"epoch": 0.5374870197300103,
"grad_norm": 0.5106934309005737,
"learning_rate": 8e-05,
"loss": 1.5918,
"step": 2588
},
{
"epoch": 0.5376947040498442,
"grad_norm": 0.5113598704338074,
"learning_rate": 8e-05,
"loss": 1.5927,
"step": 2589
},
{
"epoch": 0.5379023883696781,
"grad_norm": 0.5320143103599548,
"learning_rate": 8e-05,
"loss": 1.615,
"step": 2590
},
{
"epoch": 0.5381100726895119,
"grad_norm": 0.5382052063941956,
"learning_rate": 8e-05,
"loss": 1.5541,
"step": 2591
},
{
"epoch": 0.5383177570093458,
"grad_norm": 0.5445559620857239,
"learning_rate": 8e-05,
"loss": 1.5962,
"step": 2592
},
{
"epoch": 0.5385254413291797,
"grad_norm": 0.49818602204322815,
"learning_rate": 8e-05,
"loss": 1.5125,
"step": 2593
},
{
"epoch": 0.5387331256490135,
"grad_norm": 0.5041065812110901,
"learning_rate": 8e-05,
"loss": 1.5459,
"step": 2594
},
{
"epoch": 0.5389408099688473,
"grad_norm": 0.5130652189254761,
"learning_rate": 8e-05,
"loss": 1.5832,
"step": 2595
},
{
"epoch": 0.5391484942886812,
"grad_norm": 0.5492714047431946,
"learning_rate": 8e-05,
"loss": 1.5877,
"step": 2596
},
{
"epoch": 0.539356178608515,
"grad_norm": 0.4994000196456909,
"learning_rate": 8e-05,
"loss": 1.4821,
"step": 2597
},
{
"epoch": 0.5395638629283489,
"grad_norm": 0.529475748538971,
"learning_rate": 8e-05,
"loss": 1.6487,
"step": 2598
},
{
"epoch": 0.5397715472481828,
"grad_norm": 0.5280352830886841,
"learning_rate": 8e-05,
"loss": 1.5581,
"step": 2599
},
{
"epoch": 0.5399792315680166,
"grad_norm": 0.5195119380950928,
"learning_rate": 8e-05,
"loss": 1.5381,
"step": 2600
},
{
"epoch": 0.5401869158878505,
"grad_norm": 0.5004687905311584,
"learning_rate": 8e-05,
"loss": 1.4899,
"step": 2601
},
{
"epoch": 0.5403946002076844,
"grad_norm": 0.5243152976036072,
"learning_rate": 8e-05,
"loss": 1.6188,
"step": 2602
},
{
"epoch": 0.5406022845275181,
"grad_norm": 0.49481499195098877,
"learning_rate": 8e-05,
"loss": 1.5984,
"step": 2603
},
{
"epoch": 0.540809968847352,
"grad_norm": 0.4946075975894928,
"learning_rate": 8e-05,
"loss": 1.5462,
"step": 2604
},
{
"epoch": 0.5410176531671859,
"grad_norm": 0.5581692457199097,
"learning_rate": 8e-05,
"loss": 1.5007,
"step": 2605
},
{
"epoch": 0.5412253374870197,
"grad_norm": 0.514069676399231,
"learning_rate": 8e-05,
"loss": 1.5571,
"step": 2606
},
{
"epoch": 0.5414330218068536,
"grad_norm": 0.5100674629211426,
"learning_rate": 8e-05,
"loss": 1.5248,
"step": 2607
},
{
"epoch": 0.5416407061266875,
"grad_norm": 0.5168145895004272,
"learning_rate": 8e-05,
"loss": 1.5596,
"step": 2608
},
{
"epoch": 0.5418483904465213,
"grad_norm": 0.5216502547264099,
"learning_rate": 8e-05,
"loss": 1.5369,
"step": 2609
},
{
"epoch": 0.5420560747663551,
"grad_norm": 0.5220406651496887,
"learning_rate": 8e-05,
"loss": 1.6143,
"step": 2610
},
{
"epoch": 0.542263759086189,
"grad_norm": 0.5106246471405029,
"learning_rate": 8e-05,
"loss": 1.5871,
"step": 2611
},
{
"epoch": 0.5424714434060228,
"grad_norm": 0.5054118633270264,
"learning_rate": 8e-05,
"loss": 1.4816,
"step": 2612
},
{
"epoch": 0.5426791277258567,
"grad_norm": 0.5267699956893921,
"learning_rate": 8e-05,
"loss": 1.5261,
"step": 2613
},
{
"epoch": 0.5428868120456906,
"grad_norm": 0.5472517609596252,
"learning_rate": 8e-05,
"loss": 1.6014,
"step": 2614
},
{
"epoch": 0.5430944963655244,
"grad_norm": 0.5042798519134521,
"learning_rate": 8e-05,
"loss": 1.5761,
"step": 2615
},
{
"epoch": 0.5433021806853583,
"grad_norm": 0.5117297172546387,
"learning_rate": 8e-05,
"loss": 1.5245,
"step": 2616
},
{
"epoch": 0.5435098650051922,
"grad_norm": 0.6044188737869263,
"learning_rate": 8e-05,
"loss": 1.571,
"step": 2617
},
{
"epoch": 0.5437175493250259,
"grad_norm": 0.5296552777290344,
"learning_rate": 8e-05,
"loss": 1.5735,
"step": 2618
},
{
"epoch": 0.5439252336448598,
"grad_norm": 0.5351176261901855,
"learning_rate": 8e-05,
"loss": 1.5473,
"step": 2619
},
{
"epoch": 0.5441329179646937,
"grad_norm": 0.5096343755722046,
"learning_rate": 8e-05,
"loss": 1.549,
"step": 2620
},
{
"epoch": 0.5443406022845275,
"grad_norm": 0.6275840401649475,
"learning_rate": 8e-05,
"loss": 1.5542,
"step": 2621
},
{
"epoch": 0.5445482866043614,
"grad_norm": 0.5917977094650269,
"learning_rate": 8e-05,
"loss": 1.5259,
"step": 2622
},
{
"epoch": 0.5447559709241953,
"grad_norm": 0.5169813632965088,
"learning_rate": 8e-05,
"loss": 1.5327,
"step": 2623
},
{
"epoch": 0.5449636552440291,
"grad_norm": 0.5194140076637268,
"learning_rate": 8e-05,
"loss": 1.6058,
"step": 2624
},
{
"epoch": 0.5451713395638629,
"grad_norm": 0.5479727387428284,
"learning_rate": 8e-05,
"loss": 1.5855,
"step": 2625
},
{
"epoch": 0.5453790238836967,
"grad_norm": 0.5423098206520081,
"learning_rate": 8e-05,
"loss": 1.58,
"step": 2626
},
{
"epoch": 0.5455867082035306,
"grad_norm": 0.5254849195480347,
"learning_rate": 8e-05,
"loss": 1.5466,
"step": 2627
},
{
"epoch": 0.5457943925233645,
"grad_norm": 0.5386984944343567,
"learning_rate": 8e-05,
"loss": 1.5583,
"step": 2628
},
{
"epoch": 0.5460020768431983,
"grad_norm": 0.5074415802955627,
"learning_rate": 8e-05,
"loss": 1.5341,
"step": 2629
},
{
"epoch": 0.5462097611630322,
"grad_norm": 0.5718663334846497,
"learning_rate": 8e-05,
"loss": 1.6086,
"step": 2630
},
{
"epoch": 0.5464174454828661,
"grad_norm": 0.5871424674987793,
"learning_rate": 8e-05,
"loss": 1.5826,
"step": 2631
},
{
"epoch": 0.5466251298027,
"grad_norm": 0.5480154156684875,
"learning_rate": 8e-05,
"loss": 1.6565,
"step": 2632
},
{
"epoch": 0.5468328141225337,
"grad_norm": 0.48676350712776184,
"learning_rate": 8e-05,
"loss": 1.5143,
"step": 2633
},
{
"epoch": 0.5470404984423676,
"grad_norm": 0.5588537454605103,
"learning_rate": 8e-05,
"loss": 1.5561,
"step": 2634
},
{
"epoch": 0.5472481827622014,
"grad_norm": 0.5413640141487122,
"learning_rate": 8e-05,
"loss": 1.6585,
"step": 2635
},
{
"epoch": 0.5474558670820353,
"grad_norm": 0.49471515417099,
"learning_rate": 8e-05,
"loss": 1.4957,
"step": 2636
},
{
"epoch": 0.5476635514018692,
"grad_norm": 0.5229868292808533,
"learning_rate": 8e-05,
"loss": 1.5581,
"step": 2637
},
{
"epoch": 0.547871235721703,
"grad_norm": 0.5099173188209534,
"learning_rate": 8e-05,
"loss": 1.5943,
"step": 2638
},
{
"epoch": 0.5480789200415369,
"grad_norm": 0.5230522751808167,
"learning_rate": 8e-05,
"loss": 1.5662,
"step": 2639
},
{
"epoch": 0.5482866043613707,
"grad_norm": 0.5060945749282837,
"learning_rate": 8e-05,
"loss": 1.5591,
"step": 2640
},
{
"epoch": 0.5484942886812045,
"grad_norm": 0.5048191547393799,
"learning_rate": 8e-05,
"loss": 1.5547,
"step": 2641
},
{
"epoch": 0.5487019730010384,
"grad_norm": 0.49480634927749634,
"learning_rate": 8e-05,
"loss": 1.5281,
"step": 2642
},
{
"epoch": 0.5489096573208723,
"grad_norm": 0.503041684627533,
"learning_rate": 8e-05,
"loss": 1.5378,
"step": 2643
},
{
"epoch": 0.5491173416407061,
"grad_norm": 0.5225034952163696,
"learning_rate": 8e-05,
"loss": 1.4993,
"step": 2644
},
{
"epoch": 0.54932502596054,
"grad_norm": 0.5275830030441284,
"learning_rate": 8e-05,
"loss": 1.6099,
"step": 2645
},
{
"epoch": 0.5495327102803739,
"grad_norm": 0.5186808109283447,
"learning_rate": 8e-05,
"loss": 1.605,
"step": 2646
},
{
"epoch": 0.5497403946002077,
"grad_norm": 0.5072364211082458,
"learning_rate": 8e-05,
"loss": 1.5667,
"step": 2647
},
{
"epoch": 0.5499480789200415,
"grad_norm": 0.5087660551071167,
"learning_rate": 8e-05,
"loss": 1.5476,
"step": 2648
},
{
"epoch": 0.5501557632398754,
"grad_norm": 0.5886751413345337,
"learning_rate": 8e-05,
"loss": 1.6084,
"step": 2649
},
{
"epoch": 0.5503634475597092,
"grad_norm": 0.5000873804092407,
"learning_rate": 8e-05,
"loss": 1.5246,
"step": 2650
},
{
"epoch": 0.5505711318795431,
"grad_norm": 0.5032468438148499,
"learning_rate": 8e-05,
"loss": 1.5161,
"step": 2651
},
{
"epoch": 0.550778816199377,
"grad_norm": 0.5816187262535095,
"learning_rate": 8e-05,
"loss": 1.6107,
"step": 2652
},
{
"epoch": 0.5509865005192108,
"grad_norm": 0.5264134407043457,
"learning_rate": 8e-05,
"loss": 1.6337,
"step": 2653
},
{
"epoch": 0.5511941848390447,
"grad_norm": 0.5257280468940735,
"learning_rate": 8e-05,
"loss": 1.6287,
"step": 2654
},
{
"epoch": 0.5514018691588785,
"grad_norm": 0.47970205545425415,
"learning_rate": 8e-05,
"loss": 1.4825,
"step": 2655
},
{
"epoch": 0.5516095534787123,
"grad_norm": 0.4971425235271454,
"learning_rate": 8e-05,
"loss": 1.5492,
"step": 2656
},
{
"epoch": 0.5518172377985462,
"grad_norm": 0.5874940752983093,
"learning_rate": 8e-05,
"loss": 1.5363,
"step": 2657
},
{
"epoch": 0.5520249221183801,
"grad_norm": 0.5532630085945129,
"learning_rate": 8e-05,
"loss": 1.5289,
"step": 2658
},
{
"epoch": 0.5522326064382139,
"grad_norm": 0.5012029409408569,
"learning_rate": 8e-05,
"loss": 1.592,
"step": 2659
},
{
"epoch": 0.5524402907580478,
"grad_norm": 0.5123198628425598,
"learning_rate": 8e-05,
"loss": 1.5494,
"step": 2660
},
{
"epoch": 0.5526479750778817,
"grad_norm": 0.521674394607544,
"learning_rate": 8e-05,
"loss": 1.5383,
"step": 2661
},
{
"epoch": 0.5528556593977155,
"grad_norm": 0.5289007425308228,
"learning_rate": 8e-05,
"loss": 1.542,
"step": 2662
},
{
"epoch": 0.5530633437175493,
"grad_norm": 0.5011623501777649,
"learning_rate": 8e-05,
"loss": 1.5637,
"step": 2663
},
{
"epoch": 0.5532710280373832,
"grad_norm": 0.5280729532241821,
"learning_rate": 8e-05,
"loss": 1.5499,
"step": 2664
},
{
"epoch": 0.553478712357217,
"grad_norm": 0.5130934715270996,
"learning_rate": 8e-05,
"loss": 1.596,
"step": 2665
},
{
"epoch": 0.5536863966770509,
"grad_norm": 0.5125265717506409,
"learning_rate": 8e-05,
"loss": 1.5102,
"step": 2666
},
{
"epoch": 0.5538940809968848,
"grad_norm": 0.5373044610023499,
"learning_rate": 8e-05,
"loss": 1.5319,
"step": 2667
},
{
"epoch": 0.5541017653167186,
"grad_norm": 0.5123081207275391,
"learning_rate": 8e-05,
"loss": 1.5356,
"step": 2668
},
{
"epoch": 0.5543094496365525,
"grad_norm": 0.5117352604866028,
"learning_rate": 8e-05,
"loss": 1.5098,
"step": 2669
},
{
"epoch": 0.5545171339563862,
"grad_norm": 0.5064887404441833,
"learning_rate": 8e-05,
"loss": 1.5513,
"step": 2670
},
{
"epoch": 0.5547248182762201,
"grad_norm": 0.5283700823783875,
"learning_rate": 8e-05,
"loss": 1.6139,
"step": 2671
},
{
"epoch": 0.554932502596054,
"grad_norm": 0.5440184473991394,
"learning_rate": 8e-05,
"loss": 1.5557,
"step": 2672
},
{
"epoch": 0.5551401869158878,
"grad_norm": 0.5373353958129883,
"learning_rate": 8e-05,
"loss": 1.608,
"step": 2673
},
{
"epoch": 0.5553478712357217,
"grad_norm": 0.5456499457359314,
"learning_rate": 8e-05,
"loss": 1.6078,
"step": 2674
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.5141017436981201,
"learning_rate": 8e-05,
"loss": 1.5639,
"step": 2675
},
{
"epoch": 0.5557632398753894,
"grad_norm": 0.5070343613624573,
"learning_rate": 8e-05,
"loss": 1.5033,
"step": 2676
},
{
"epoch": 0.5559709241952233,
"grad_norm": 0.5327128171920776,
"learning_rate": 8e-05,
"loss": 1.5481,
"step": 2677
},
{
"epoch": 0.5561786085150571,
"grad_norm": 0.6137487888336182,
"learning_rate": 8e-05,
"loss": 1.6048,
"step": 2678
},
{
"epoch": 0.5563862928348909,
"grad_norm": 0.5100077390670776,
"learning_rate": 8e-05,
"loss": 1.4818,
"step": 2679
},
{
"epoch": 0.5565939771547248,
"grad_norm": 0.5046921372413635,
"learning_rate": 8e-05,
"loss": 1.5365,
"step": 2680
},
{
"epoch": 0.5568016614745587,
"grad_norm": 0.5135619044303894,
"learning_rate": 8e-05,
"loss": 1.5347,
"step": 2681
},
{
"epoch": 0.5570093457943925,
"grad_norm": 0.5596857666969299,
"learning_rate": 8e-05,
"loss": 1.5836,
"step": 2682
},
{
"epoch": 0.5572170301142264,
"grad_norm": 0.5241439938545227,
"learning_rate": 8e-05,
"loss": 1.6102,
"step": 2683
},
{
"epoch": 0.5574247144340603,
"grad_norm": 0.5305902361869812,
"learning_rate": 8e-05,
"loss": 1.5581,
"step": 2684
},
{
"epoch": 0.557632398753894,
"grad_norm": 0.4992052912712097,
"learning_rate": 8e-05,
"loss": 1.5119,
"step": 2685
},
{
"epoch": 0.5578400830737279,
"grad_norm": 0.5193141102790833,
"learning_rate": 8e-05,
"loss": 1.5624,
"step": 2686
},
{
"epoch": 0.5580477673935618,
"grad_norm": 0.5143951773643494,
"learning_rate": 8e-05,
"loss": 1.5473,
"step": 2687
},
{
"epoch": 0.5582554517133956,
"grad_norm": 0.5419348478317261,
"learning_rate": 8e-05,
"loss": 1.5579,
"step": 2688
},
{
"epoch": 0.5584631360332295,
"grad_norm": 0.5398904085159302,
"learning_rate": 8e-05,
"loss": 1.5888,
"step": 2689
},
{
"epoch": 0.5586708203530634,
"grad_norm": 0.5264995098114014,
"learning_rate": 8e-05,
"loss": 1.5997,
"step": 2690
},
{
"epoch": 0.5588785046728972,
"grad_norm": 0.5043335556983948,
"learning_rate": 8e-05,
"loss": 1.5461,
"step": 2691
},
{
"epoch": 0.5590861889927311,
"grad_norm": 0.5361884236335754,
"learning_rate": 8e-05,
"loss": 1.5636,
"step": 2692
},
{
"epoch": 0.5592938733125649,
"grad_norm": 0.5052306056022644,
"learning_rate": 8e-05,
"loss": 1.5238,
"step": 2693
},
{
"epoch": 0.5595015576323987,
"grad_norm": 0.49911513924598694,
"learning_rate": 8e-05,
"loss": 1.537,
"step": 2694
},
{
"epoch": 0.5597092419522326,
"grad_norm": 0.5349092483520508,
"learning_rate": 8e-05,
"loss": 1.5804,
"step": 2695
},
{
"epoch": 0.5599169262720665,
"grad_norm": 0.5042306780815125,
"learning_rate": 8e-05,
"loss": 1.4703,
"step": 2696
},
{
"epoch": 0.5601246105919003,
"grad_norm": 0.5325839519500732,
"learning_rate": 8e-05,
"loss": 1.5393,
"step": 2697
},
{
"epoch": 0.5603322949117342,
"grad_norm": 0.5401809811592102,
"learning_rate": 8e-05,
"loss": 1.6033,
"step": 2698
},
{
"epoch": 0.5605399792315681,
"grad_norm": 0.5372620224952698,
"learning_rate": 8e-05,
"loss": 1.4942,
"step": 2699
},
{
"epoch": 0.5607476635514018,
"grad_norm": 0.5419166684150696,
"learning_rate": 8e-05,
"loss": 1.5295,
"step": 2700
},
{
"epoch": 0.5609553478712357,
"grad_norm": 0.5134370923042297,
"learning_rate": 8e-05,
"loss": 1.4819,
"step": 2701
},
{
"epoch": 0.5611630321910696,
"grad_norm": 0.5202261209487915,
"learning_rate": 8e-05,
"loss": 1.5344,
"step": 2702
},
{
"epoch": 0.5613707165109034,
"grad_norm": 0.5219168066978455,
"learning_rate": 8e-05,
"loss": 1.5119,
"step": 2703
},
{
"epoch": 0.5615784008307373,
"grad_norm": 0.522109866142273,
"learning_rate": 8e-05,
"loss": 1.5409,
"step": 2704
},
{
"epoch": 0.5617860851505712,
"grad_norm": 0.5673719644546509,
"learning_rate": 8e-05,
"loss": 1.6408,
"step": 2705
},
{
"epoch": 0.561993769470405,
"grad_norm": 0.5494527816772461,
"learning_rate": 8e-05,
"loss": 1.5707,
"step": 2706
},
{
"epoch": 0.5622014537902389,
"grad_norm": 0.5136489272117615,
"learning_rate": 8e-05,
"loss": 1.5566,
"step": 2707
},
{
"epoch": 0.5624091381100726,
"grad_norm": 0.5121893286705017,
"learning_rate": 8e-05,
"loss": 1.5256,
"step": 2708
},
{
"epoch": 0.5626168224299065,
"grad_norm": 0.49312853813171387,
"learning_rate": 8e-05,
"loss": 1.5215,
"step": 2709
},
{
"epoch": 0.5628245067497404,
"grad_norm": 0.5472443103790283,
"learning_rate": 8e-05,
"loss": 1.5127,
"step": 2710
},
{
"epoch": 0.5630321910695743,
"grad_norm": 0.5221959352493286,
"learning_rate": 8e-05,
"loss": 1.5957,
"step": 2711
},
{
"epoch": 0.5632398753894081,
"grad_norm": 0.5420714020729065,
"learning_rate": 8e-05,
"loss": 1.5427,
"step": 2712
},
{
"epoch": 0.563447559709242,
"grad_norm": 0.5317118763923645,
"learning_rate": 8e-05,
"loss": 1.6281,
"step": 2713
},
{
"epoch": 0.5636552440290759,
"grad_norm": 0.531112790107727,
"learning_rate": 8e-05,
"loss": 1.5929,
"step": 2714
},
{
"epoch": 0.5638629283489096,
"grad_norm": 0.5138751864433289,
"learning_rate": 8e-05,
"loss": 1.5962,
"step": 2715
},
{
"epoch": 0.5640706126687435,
"grad_norm": 0.493773877620697,
"learning_rate": 8e-05,
"loss": 1.4706,
"step": 2716
},
{
"epoch": 0.5642782969885773,
"grad_norm": 0.5258544087409973,
"learning_rate": 8e-05,
"loss": 1.5494,
"step": 2717
},
{
"epoch": 0.5644859813084112,
"grad_norm": 0.5253569483757019,
"learning_rate": 8e-05,
"loss": 1.5538,
"step": 2718
},
{
"epoch": 0.5646936656282451,
"grad_norm": 0.5212979912757874,
"learning_rate": 8e-05,
"loss": 1.4899,
"step": 2719
},
{
"epoch": 0.564901349948079,
"grad_norm": 0.5010308027267456,
"learning_rate": 8e-05,
"loss": 1.5258,
"step": 2720
},
{
"epoch": 0.5651090342679128,
"grad_norm": 0.5242239236831665,
"learning_rate": 8e-05,
"loss": 1.5427,
"step": 2721
},
{
"epoch": 0.5653167185877467,
"grad_norm": 0.4979899227619171,
"learning_rate": 8e-05,
"loss": 1.5751,
"step": 2722
},
{
"epoch": 0.5655244029075804,
"grad_norm": 0.5082122087478638,
"learning_rate": 8e-05,
"loss": 1.5637,
"step": 2723
},
{
"epoch": 0.5657320872274143,
"grad_norm": 0.5392714738845825,
"learning_rate": 8e-05,
"loss": 1.6375,
"step": 2724
},
{
"epoch": 0.5659397715472482,
"grad_norm": 0.502173900604248,
"learning_rate": 8e-05,
"loss": 1.5923,
"step": 2725
},
{
"epoch": 0.566147455867082,
"grad_norm": 0.4920365810394287,
"learning_rate": 8e-05,
"loss": 1.5303,
"step": 2726
},
{
"epoch": 0.5663551401869159,
"grad_norm": 0.4974154233932495,
"learning_rate": 8e-05,
"loss": 1.5815,
"step": 2727
},
{
"epoch": 0.5665628245067498,
"grad_norm": 0.5457067489624023,
"learning_rate": 8e-05,
"loss": 1.5774,
"step": 2728
},
{
"epoch": 0.5667705088265836,
"grad_norm": 0.5052133202552795,
"learning_rate": 8e-05,
"loss": 1.508,
"step": 2729
},
{
"epoch": 0.5669781931464174,
"grad_norm": 0.5542793869972229,
"learning_rate": 8e-05,
"loss": 1.5994,
"step": 2730
},
{
"epoch": 0.5671858774662513,
"grad_norm": 0.5140535235404968,
"learning_rate": 8e-05,
"loss": 1.5062,
"step": 2731
},
{
"epoch": 0.5673935617860851,
"grad_norm": 0.5263282060623169,
"learning_rate": 8e-05,
"loss": 1.5932,
"step": 2732
},
{
"epoch": 0.567601246105919,
"grad_norm": 0.5152779817581177,
"learning_rate": 8e-05,
"loss": 1.4961,
"step": 2733
},
{
"epoch": 0.5678089304257529,
"grad_norm": 0.5263432860374451,
"learning_rate": 8e-05,
"loss": 1.5839,
"step": 2734
},
{
"epoch": 0.5680166147455867,
"grad_norm": 0.4997681677341461,
"learning_rate": 8e-05,
"loss": 1.5555,
"step": 2735
},
{
"epoch": 0.5682242990654206,
"grad_norm": 0.5276387929916382,
"learning_rate": 8e-05,
"loss": 1.5914,
"step": 2736
},
{
"epoch": 0.5684319833852544,
"grad_norm": 0.4941411316394806,
"learning_rate": 8e-05,
"loss": 1.5112,
"step": 2737
},
{
"epoch": 0.5686396677050882,
"grad_norm": 0.5030642151832581,
"learning_rate": 8e-05,
"loss": 1.4287,
"step": 2738
},
{
"epoch": 0.5688473520249221,
"grad_norm": 0.47869259119033813,
"learning_rate": 8e-05,
"loss": 1.5055,
"step": 2739
},
{
"epoch": 0.569055036344756,
"grad_norm": 0.5265507102012634,
"learning_rate": 8e-05,
"loss": 1.5973,
"step": 2740
},
{
"epoch": 0.5692627206645898,
"grad_norm": 0.528171956539154,
"learning_rate": 8e-05,
"loss": 1.5892,
"step": 2741
},
{
"epoch": 0.5694704049844237,
"grad_norm": 0.5014118552207947,
"learning_rate": 8e-05,
"loss": 1.5203,
"step": 2742
},
{
"epoch": 0.5696780893042576,
"grad_norm": 0.5167685747146606,
"learning_rate": 8e-05,
"loss": 1.564,
"step": 2743
},
{
"epoch": 0.5698857736240914,
"grad_norm": 0.5222561359405518,
"learning_rate": 8e-05,
"loss": 1.5243,
"step": 2744
},
{
"epoch": 0.5700934579439252,
"grad_norm": 0.5133991241455078,
"learning_rate": 8e-05,
"loss": 1.5403,
"step": 2745
},
{
"epoch": 0.570301142263759,
"grad_norm": 0.5201916098594666,
"learning_rate": 8e-05,
"loss": 1.5732,
"step": 2746
},
{
"epoch": 0.5705088265835929,
"grad_norm": 0.5097384452819824,
"learning_rate": 8e-05,
"loss": 1.5958,
"step": 2747
},
{
"epoch": 0.5707165109034268,
"grad_norm": 0.5123527646064758,
"learning_rate": 8e-05,
"loss": 1.5869,
"step": 2748
},
{
"epoch": 0.5709241952232607,
"grad_norm": 0.5189564228057861,
"learning_rate": 8e-05,
"loss": 1.5932,
"step": 2749
},
{
"epoch": 0.5711318795430945,
"grad_norm": 0.5192084312438965,
"learning_rate": 8e-05,
"loss": 1.5284,
"step": 2750
},
{
"epoch": 0.5713395638629284,
"grad_norm": 0.5026838183403015,
"learning_rate": 8e-05,
"loss": 1.5058,
"step": 2751
},
{
"epoch": 0.5715472481827621,
"grad_norm": 0.5398958325386047,
"learning_rate": 8e-05,
"loss": 1.6339,
"step": 2752
},
{
"epoch": 0.571754932502596,
"grad_norm": 0.5269039273262024,
"learning_rate": 8e-05,
"loss": 1.5635,
"step": 2753
},
{
"epoch": 0.5719626168224299,
"grad_norm": 0.5309609174728394,
"learning_rate": 8e-05,
"loss": 1.4882,
"step": 2754
},
{
"epoch": 0.5721703011422637,
"grad_norm": 0.5078904032707214,
"learning_rate": 8e-05,
"loss": 1.6141,
"step": 2755
},
{
"epoch": 0.5723779854620976,
"grad_norm": 0.5262061357498169,
"learning_rate": 8e-05,
"loss": 1.608,
"step": 2756
},
{
"epoch": 0.5725856697819315,
"grad_norm": 0.5232616662979126,
"learning_rate": 8e-05,
"loss": 1.5799,
"step": 2757
},
{
"epoch": 0.5727933541017654,
"grad_norm": 0.49849748611450195,
"learning_rate": 8e-05,
"loss": 1.5073,
"step": 2758
},
{
"epoch": 0.5730010384215992,
"grad_norm": 0.5224372744560242,
"learning_rate": 8e-05,
"loss": 1.6051,
"step": 2759
},
{
"epoch": 0.573208722741433,
"grad_norm": 0.518502414226532,
"learning_rate": 8e-05,
"loss": 1.5905,
"step": 2760
},
{
"epoch": 0.5734164070612668,
"grad_norm": 0.511958658695221,
"learning_rate": 8e-05,
"loss": 1.4851,
"step": 2761
},
{
"epoch": 0.5736240913811007,
"grad_norm": 0.5087369680404663,
"learning_rate": 8e-05,
"loss": 1.5221,
"step": 2762
},
{
"epoch": 0.5738317757009346,
"grad_norm": 0.7710164785385132,
"learning_rate": 8e-05,
"loss": 1.5034,
"step": 2763
},
{
"epoch": 0.5740394600207684,
"grad_norm": 0.5106395483016968,
"learning_rate": 8e-05,
"loss": 1.5988,
"step": 2764
},
{
"epoch": 0.5742471443406023,
"grad_norm": 0.5100325345993042,
"learning_rate": 8e-05,
"loss": 1.5096,
"step": 2765
},
{
"epoch": 0.5744548286604362,
"grad_norm": 0.49486199021339417,
"learning_rate": 8e-05,
"loss": 1.4733,
"step": 2766
},
{
"epoch": 0.5746625129802699,
"grad_norm": 0.5689396262168884,
"learning_rate": 8e-05,
"loss": 1.65,
"step": 2767
},
{
"epoch": 0.5748701973001038,
"grad_norm": 0.5266242623329163,
"learning_rate": 8e-05,
"loss": 1.5513,
"step": 2768
},
{
"epoch": 0.5750778816199377,
"grad_norm": 0.5084609985351562,
"learning_rate": 8e-05,
"loss": 1.5034,
"step": 2769
},
{
"epoch": 0.5752855659397715,
"grad_norm": 0.5007745027542114,
"learning_rate": 8e-05,
"loss": 1.5009,
"step": 2770
},
{
"epoch": 0.5754932502596054,
"grad_norm": 0.520416796207428,
"learning_rate": 8e-05,
"loss": 1.5231,
"step": 2771
},
{
"epoch": 0.5757009345794393,
"grad_norm": 0.5064281225204468,
"learning_rate": 8e-05,
"loss": 1.5356,
"step": 2772
},
{
"epoch": 0.5759086188992731,
"grad_norm": 0.5355567336082458,
"learning_rate": 8e-05,
"loss": 1.5831,
"step": 2773
},
{
"epoch": 0.576116303219107,
"grad_norm": 0.5114003419876099,
"learning_rate": 8e-05,
"loss": 1.5855,
"step": 2774
},
{
"epoch": 0.5763239875389408,
"grad_norm": 0.5105564594268799,
"learning_rate": 8e-05,
"loss": 1.5277,
"step": 2775
},
{
"epoch": 0.5765316718587746,
"grad_norm": 0.5268367528915405,
"learning_rate": 8e-05,
"loss": 1.607,
"step": 2776
},
{
"epoch": 0.5767393561786085,
"grad_norm": 0.5006731748580933,
"learning_rate": 8e-05,
"loss": 1.6212,
"step": 2777
},
{
"epoch": 0.5769470404984424,
"grad_norm": 0.5249113440513611,
"learning_rate": 8e-05,
"loss": 1.5704,
"step": 2778
},
{
"epoch": 0.5771547248182762,
"grad_norm": 0.5069587230682373,
"learning_rate": 8e-05,
"loss": 1.5597,
"step": 2779
},
{
"epoch": 0.5773624091381101,
"grad_norm": 0.5126855373382568,
"learning_rate": 8e-05,
"loss": 1.5091,
"step": 2780
},
{
"epoch": 0.577570093457944,
"grad_norm": 0.5261822938919067,
"learning_rate": 8e-05,
"loss": 1.4847,
"step": 2781
},
{
"epoch": 0.5777777777777777,
"grad_norm": 0.5000608563423157,
"learning_rate": 8e-05,
"loss": 1.5287,
"step": 2782
},
{
"epoch": 0.5779854620976116,
"grad_norm": 0.5185624361038208,
"learning_rate": 8e-05,
"loss": 1.5692,
"step": 2783
},
{
"epoch": 0.5781931464174455,
"grad_norm": 0.5299122929573059,
"learning_rate": 8e-05,
"loss": 1.5857,
"step": 2784
},
{
"epoch": 0.5784008307372793,
"grad_norm": 0.5279645323753357,
"learning_rate": 8e-05,
"loss": 1.586,
"step": 2785
},
{
"epoch": 0.5786085150571132,
"grad_norm": 0.5176546573638916,
"learning_rate": 8e-05,
"loss": 1.6287,
"step": 2786
},
{
"epoch": 0.5788161993769471,
"grad_norm": 0.512985348701477,
"learning_rate": 8e-05,
"loss": 1.5642,
"step": 2787
},
{
"epoch": 0.5790238836967809,
"grad_norm": 0.5028579831123352,
"learning_rate": 8e-05,
"loss": 1.4601,
"step": 2788
},
{
"epoch": 0.5792315680166148,
"grad_norm": 0.5143795609474182,
"learning_rate": 8e-05,
"loss": 1.5745,
"step": 2789
},
{
"epoch": 0.5794392523364486,
"grad_norm": 0.5350067019462585,
"learning_rate": 8e-05,
"loss": 1.6075,
"step": 2790
},
{
"epoch": 0.5796469366562824,
"grad_norm": 0.5133538842201233,
"learning_rate": 8e-05,
"loss": 1.6325,
"step": 2791
},
{
"epoch": 0.5798546209761163,
"grad_norm": 0.49931225180625916,
"learning_rate": 8e-05,
"loss": 1.5322,
"step": 2792
},
{
"epoch": 0.5800623052959502,
"grad_norm": 0.5204224586486816,
"learning_rate": 8e-05,
"loss": 1.5818,
"step": 2793
},
{
"epoch": 0.580269989615784,
"grad_norm": 0.5103965997695923,
"learning_rate": 8e-05,
"loss": 1.518,
"step": 2794
},
{
"epoch": 0.5804776739356179,
"grad_norm": 0.5115767121315002,
"learning_rate": 8e-05,
"loss": 1.5504,
"step": 2795
},
{
"epoch": 0.5806853582554518,
"grad_norm": 0.5340980291366577,
"learning_rate": 8e-05,
"loss": 1.5652,
"step": 2796
},
{
"epoch": 0.5808930425752855,
"grad_norm": 0.535073459148407,
"learning_rate": 8e-05,
"loss": 1.5424,
"step": 2797
},
{
"epoch": 0.5811007268951194,
"grad_norm": 0.5498014092445374,
"learning_rate": 8e-05,
"loss": 1.6218,
"step": 2798
},
{
"epoch": 0.5813084112149532,
"grad_norm": 0.5135794281959534,
"learning_rate": 8e-05,
"loss": 1.5992,
"step": 2799
},
{
"epoch": 0.5815160955347871,
"grad_norm": 0.5113550424575806,
"learning_rate": 8e-05,
"loss": 1.5281,
"step": 2800
},
{
"epoch": 0.581723779854621,
"grad_norm": 0.5498125553131104,
"learning_rate": 8e-05,
"loss": 1.5895,
"step": 2801
},
{
"epoch": 0.5819314641744548,
"grad_norm": 0.5104403495788574,
"learning_rate": 8e-05,
"loss": 1.4963,
"step": 2802
},
{
"epoch": 0.5821391484942887,
"grad_norm": 0.5074317455291748,
"learning_rate": 8e-05,
"loss": 1.5134,
"step": 2803
},
{
"epoch": 0.5823468328141226,
"grad_norm": 0.5222622156143188,
"learning_rate": 8e-05,
"loss": 1.525,
"step": 2804
},
{
"epoch": 0.5825545171339563,
"grad_norm": 0.5213282108306885,
"learning_rate": 8e-05,
"loss": 1.5767,
"step": 2805
},
{
"epoch": 0.5827622014537902,
"grad_norm": 0.5274381637573242,
"learning_rate": 8e-05,
"loss": 1.6413,
"step": 2806
},
{
"epoch": 0.5829698857736241,
"grad_norm": 0.5199068784713745,
"learning_rate": 8e-05,
"loss": 1.5654,
"step": 2807
},
{
"epoch": 0.5831775700934579,
"grad_norm": 0.5022305846214294,
"learning_rate": 8e-05,
"loss": 1.5406,
"step": 2808
},
{
"epoch": 0.5833852544132918,
"grad_norm": 0.5012241005897522,
"learning_rate": 8e-05,
"loss": 1.5701,
"step": 2809
},
{
"epoch": 0.5835929387331257,
"grad_norm": 0.5507093667984009,
"learning_rate": 8e-05,
"loss": 1.5963,
"step": 2810
},
{
"epoch": 0.5838006230529595,
"grad_norm": 0.5283317565917969,
"learning_rate": 8e-05,
"loss": 1.5609,
"step": 2811
},
{
"epoch": 0.5840083073727933,
"grad_norm": 0.5261059403419495,
"learning_rate": 8e-05,
"loss": 1.5373,
"step": 2812
},
{
"epoch": 0.5842159916926272,
"grad_norm": 0.5131132006645203,
"learning_rate": 8e-05,
"loss": 1.5572,
"step": 2813
},
{
"epoch": 0.584423676012461,
"grad_norm": 0.5188307762145996,
"learning_rate": 8e-05,
"loss": 1.5868,
"step": 2814
},
{
"epoch": 0.5846313603322949,
"grad_norm": 0.514680802822113,
"learning_rate": 8e-05,
"loss": 1.5561,
"step": 2815
},
{
"epoch": 0.5848390446521288,
"grad_norm": 0.5244585275650024,
"learning_rate": 8e-05,
"loss": 1.6002,
"step": 2816
},
{
"epoch": 0.5850467289719626,
"grad_norm": 0.5245557427406311,
"learning_rate": 8e-05,
"loss": 1.6044,
"step": 2817
},
{
"epoch": 0.5852544132917965,
"grad_norm": 0.51822429895401,
"learning_rate": 8e-05,
"loss": 1.5826,
"step": 2818
},
{
"epoch": 0.5854620976116304,
"grad_norm": 0.5252143144607544,
"learning_rate": 8e-05,
"loss": 1.5347,
"step": 2819
},
{
"epoch": 0.5856697819314641,
"grad_norm": 0.5217255353927612,
"learning_rate": 8e-05,
"loss": 1.5964,
"step": 2820
},
{
"epoch": 0.585877466251298,
"grad_norm": 0.5508246421813965,
"learning_rate": 8e-05,
"loss": 1.6135,
"step": 2821
},
{
"epoch": 0.5860851505711319,
"grad_norm": 0.5165861248970032,
"learning_rate": 8e-05,
"loss": 1.5907,
"step": 2822
},
{
"epoch": 0.5862928348909657,
"grad_norm": 0.5337631702423096,
"learning_rate": 8e-05,
"loss": 1.5683,
"step": 2823
},
{
"epoch": 0.5865005192107996,
"grad_norm": 0.5257221460342407,
"learning_rate": 8e-05,
"loss": 1.6359,
"step": 2824
},
{
"epoch": 0.5867082035306335,
"grad_norm": 0.5221315622329712,
"learning_rate": 8e-05,
"loss": 1.5157,
"step": 2825
},
{
"epoch": 0.5869158878504673,
"grad_norm": 0.5321351289749146,
"learning_rate": 8e-05,
"loss": 1.5844,
"step": 2826
},
{
"epoch": 0.5871235721703011,
"grad_norm": 0.5378949046134949,
"learning_rate": 8e-05,
"loss": 1.5158,
"step": 2827
},
{
"epoch": 0.587331256490135,
"grad_norm": 0.5337066650390625,
"learning_rate": 8e-05,
"loss": 1.537,
"step": 2828
},
{
"epoch": 0.5875389408099688,
"grad_norm": 0.5137457251548767,
"learning_rate": 8e-05,
"loss": 1.5569,
"step": 2829
},
{
"epoch": 0.5877466251298027,
"grad_norm": 0.5192795395851135,
"learning_rate": 8e-05,
"loss": 1.5328,
"step": 2830
},
{
"epoch": 0.5879543094496366,
"grad_norm": 0.5277906656265259,
"learning_rate": 8e-05,
"loss": 1.5183,
"step": 2831
},
{
"epoch": 0.5881619937694704,
"grad_norm": 0.550859272480011,
"learning_rate": 8e-05,
"loss": 1.6268,
"step": 2832
},
{
"epoch": 0.5883696780893043,
"grad_norm": 0.518708348274231,
"learning_rate": 8e-05,
"loss": 1.4964,
"step": 2833
},
{
"epoch": 0.5885773624091382,
"grad_norm": 0.5725675225257874,
"learning_rate": 8e-05,
"loss": 1.6776,
"step": 2834
},
{
"epoch": 0.5887850467289719,
"grad_norm": 0.512139081954956,
"learning_rate": 8e-05,
"loss": 1.6053,
"step": 2835
},
{
"epoch": 0.5889927310488058,
"grad_norm": 0.5205104947090149,
"learning_rate": 8e-05,
"loss": 1.5682,
"step": 2836
},
{
"epoch": 0.5892004153686397,
"grad_norm": 0.5213608741760254,
"learning_rate": 8e-05,
"loss": 1.5272,
"step": 2837
},
{
"epoch": 0.5894080996884735,
"grad_norm": 0.51758873462677,
"learning_rate": 8e-05,
"loss": 1.6262,
"step": 2838
},
{
"epoch": 0.5896157840083074,
"grad_norm": 0.5142424702644348,
"learning_rate": 8e-05,
"loss": 1.59,
"step": 2839
},
{
"epoch": 0.5898234683281413,
"grad_norm": 0.5190350413322449,
"learning_rate": 8e-05,
"loss": 1.586,
"step": 2840
},
{
"epoch": 0.5900311526479751,
"grad_norm": 0.5234148502349854,
"learning_rate": 8e-05,
"loss": 1.5719,
"step": 2841
},
{
"epoch": 0.5902388369678089,
"grad_norm": 0.5219846367835999,
"learning_rate": 8e-05,
"loss": 1.5988,
"step": 2842
},
{
"epoch": 0.5904465212876427,
"grad_norm": 0.5159264206886292,
"learning_rate": 8e-05,
"loss": 1.458,
"step": 2843
},
{
"epoch": 0.5906542056074766,
"grad_norm": 0.5145145058631897,
"learning_rate": 8e-05,
"loss": 1.5651,
"step": 2844
},
{
"epoch": 0.5908618899273105,
"grad_norm": 0.5303623676300049,
"learning_rate": 8e-05,
"loss": 1.5558,
"step": 2845
},
{
"epoch": 0.5910695742471443,
"grad_norm": 0.5262109041213989,
"learning_rate": 8e-05,
"loss": 1.5433,
"step": 2846
},
{
"epoch": 0.5912772585669782,
"grad_norm": 0.5659428834915161,
"learning_rate": 8e-05,
"loss": 1.5229,
"step": 2847
},
{
"epoch": 0.5914849428868121,
"grad_norm": 0.5190215110778809,
"learning_rate": 8e-05,
"loss": 1.5649,
"step": 2848
},
{
"epoch": 0.591692627206646,
"grad_norm": 0.5212072134017944,
"learning_rate": 8e-05,
"loss": 1.4807,
"step": 2849
},
{
"epoch": 0.5919003115264797,
"grad_norm": 0.5107869505882263,
"learning_rate": 8e-05,
"loss": 1.5145,
"step": 2850
},
{
"epoch": 0.5921079958463136,
"grad_norm": 0.4874977469444275,
"learning_rate": 8e-05,
"loss": 1.5159,
"step": 2851
},
{
"epoch": 0.5923156801661474,
"grad_norm": 0.5429779291152954,
"learning_rate": 8e-05,
"loss": 1.5825,
"step": 2852
},
{
"epoch": 0.5925233644859813,
"grad_norm": 0.5432573556900024,
"learning_rate": 8e-05,
"loss": 1.6157,
"step": 2853
},
{
"epoch": 0.5927310488058152,
"grad_norm": 0.5516713261604309,
"learning_rate": 8e-05,
"loss": 1.5427,
"step": 2854
},
{
"epoch": 0.592938733125649,
"grad_norm": 0.5550795197486877,
"learning_rate": 8e-05,
"loss": 1.6004,
"step": 2855
},
{
"epoch": 0.5931464174454829,
"grad_norm": 0.5827699899673462,
"learning_rate": 8e-05,
"loss": 1.5963,
"step": 2856
},
{
"epoch": 0.5933541017653167,
"grad_norm": 0.5084548592567444,
"learning_rate": 8e-05,
"loss": 1.5318,
"step": 2857
},
{
"epoch": 0.5935617860851505,
"grad_norm": 0.49697956442832947,
"learning_rate": 8e-05,
"loss": 1.5825,
"step": 2858
},
{
"epoch": 0.5937694704049844,
"grad_norm": 0.5428705215454102,
"learning_rate": 8e-05,
"loss": 1.5933,
"step": 2859
},
{
"epoch": 0.5939771547248183,
"grad_norm": 0.5335211157798767,
"learning_rate": 8e-05,
"loss": 1.5817,
"step": 2860
},
{
"epoch": 0.5941848390446521,
"grad_norm": 0.524454653263092,
"learning_rate": 8e-05,
"loss": 1.5548,
"step": 2861
},
{
"epoch": 0.594392523364486,
"grad_norm": 0.5151386857032776,
"learning_rate": 8e-05,
"loss": 1.5608,
"step": 2862
},
{
"epoch": 0.5946002076843199,
"grad_norm": 0.5568569898605347,
"learning_rate": 8e-05,
"loss": 1.6079,
"step": 2863
},
{
"epoch": 0.5948078920041537,
"grad_norm": 0.5202062726020813,
"learning_rate": 8e-05,
"loss": 1.5789,
"step": 2864
},
{
"epoch": 0.5950155763239875,
"grad_norm": 0.5226384997367859,
"learning_rate": 8e-05,
"loss": 1.6255,
"step": 2865
},
{
"epoch": 0.5952232606438214,
"grad_norm": 0.5193942189216614,
"learning_rate": 8e-05,
"loss": 1.5521,
"step": 2866
},
{
"epoch": 0.5954309449636552,
"grad_norm": 0.5268439650535583,
"learning_rate": 8e-05,
"loss": 1.5577,
"step": 2867
},
{
"epoch": 0.5956386292834891,
"grad_norm": 0.5028793811798096,
"learning_rate": 8e-05,
"loss": 1.5542,
"step": 2868
},
{
"epoch": 0.595846313603323,
"grad_norm": 0.5032245516777039,
"learning_rate": 8e-05,
"loss": 1.5401,
"step": 2869
},
{
"epoch": 0.5960539979231568,
"grad_norm": 0.5338500738143921,
"learning_rate": 8e-05,
"loss": 1.6173,
"step": 2870
},
{
"epoch": 0.5962616822429907,
"grad_norm": 0.5496775507926941,
"learning_rate": 8e-05,
"loss": 1.5215,
"step": 2871
},
{
"epoch": 0.5964693665628245,
"grad_norm": 0.5108234882354736,
"learning_rate": 8e-05,
"loss": 1.5494,
"step": 2872
},
{
"epoch": 0.5966770508826583,
"grad_norm": 0.5167278051376343,
"learning_rate": 8e-05,
"loss": 1.4923,
"step": 2873
},
{
"epoch": 0.5968847352024922,
"grad_norm": 0.5146141052246094,
"learning_rate": 8e-05,
"loss": 1.5076,
"step": 2874
},
{
"epoch": 0.5970924195223261,
"grad_norm": 0.5203628540039062,
"learning_rate": 8e-05,
"loss": 1.5583,
"step": 2875
},
{
"epoch": 0.5973001038421599,
"grad_norm": 0.5065957903862,
"learning_rate": 8e-05,
"loss": 1.5551,
"step": 2876
},
{
"epoch": 0.5975077881619938,
"grad_norm": 0.5048226118087769,
"learning_rate": 8e-05,
"loss": 1.4814,
"step": 2877
},
{
"epoch": 0.5977154724818277,
"grad_norm": 0.5416272282600403,
"learning_rate": 8e-05,
"loss": 1.6222,
"step": 2878
},
{
"epoch": 0.5979231568016615,
"grad_norm": 0.5329111814498901,
"learning_rate": 8e-05,
"loss": 1.5685,
"step": 2879
},
{
"epoch": 0.5981308411214953,
"grad_norm": 0.5529356002807617,
"learning_rate": 8e-05,
"loss": 1.6551,
"step": 2880
},
{
"epoch": 0.5983385254413291,
"grad_norm": 0.5571921467781067,
"learning_rate": 8e-05,
"loss": 1.6435,
"step": 2881
},
{
"epoch": 0.598546209761163,
"grad_norm": 0.5245065093040466,
"learning_rate": 8e-05,
"loss": 1.5972,
"step": 2882
},
{
"epoch": 0.5987538940809969,
"grad_norm": 0.5337019562721252,
"learning_rate": 8e-05,
"loss": 1.5105,
"step": 2883
},
{
"epoch": 0.5989615784008308,
"grad_norm": 0.5535258650779724,
"learning_rate": 8e-05,
"loss": 1.6439,
"step": 2884
},
{
"epoch": 0.5991692627206646,
"grad_norm": 0.510281503200531,
"learning_rate": 8e-05,
"loss": 1.5489,
"step": 2885
},
{
"epoch": 0.5993769470404985,
"grad_norm": 0.5224695801734924,
"learning_rate": 8e-05,
"loss": 1.5011,
"step": 2886
},
{
"epoch": 0.5995846313603322,
"grad_norm": 0.533809244632721,
"learning_rate": 8e-05,
"loss": 1.6423,
"step": 2887
},
{
"epoch": 0.5997923156801661,
"grad_norm": 0.574894368648529,
"learning_rate": 8e-05,
"loss": 1.5208,
"step": 2888
},
{
"epoch": 0.6,
"grad_norm": 0.5299583673477173,
"learning_rate": 8e-05,
"loss": 1.5983,
"step": 2889
},
{
"epoch": 0.6002076843198338,
"grad_norm": 0.5273470878601074,
"learning_rate": 8e-05,
"loss": 1.5455,
"step": 2890
},
{
"epoch": 0.6004153686396677,
"grad_norm": 0.5376371145248413,
"learning_rate": 8e-05,
"loss": 1.5338,
"step": 2891
},
{
"epoch": 0.6006230529595016,
"grad_norm": 0.5445219278335571,
"learning_rate": 8e-05,
"loss": 1.6045,
"step": 2892
},
{
"epoch": 0.6008307372793354,
"grad_norm": 0.5201619267463684,
"learning_rate": 8e-05,
"loss": 1.5215,
"step": 2893
},
{
"epoch": 0.6010384215991693,
"grad_norm": 0.5154725909233093,
"learning_rate": 8e-05,
"loss": 1.5279,
"step": 2894
},
{
"epoch": 0.6012461059190031,
"grad_norm": 0.5375370979309082,
"learning_rate": 8e-05,
"loss": 1.5252,
"step": 2895
},
{
"epoch": 0.6014537902388369,
"grad_norm": 0.5726920366287231,
"learning_rate": 8e-05,
"loss": 1.5772,
"step": 2896
},
{
"epoch": 0.6016614745586708,
"grad_norm": 0.5069287419319153,
"learning_rate": 8e-05,
"loss": 1.5117,
"step": 2897
},
{
"epoch": 0.6018691588785047,
"grad_norm": 0.5421019792556763,
"learning_rate": 8e-05,
"loss": 1.5698,
"step": 2898
},
{
"epoch": 0.6020768431983385,
"grad_norm": 0.5770023465156555,
"learning_rate": 8e-05,
"loss": 1.6081,
"step": 2899
},
{
"epoch": 0.6022845275181724,
"grad_norm": 0.5031346678733826,
"learning_rate": 8e-05,
"loss": 1.5555,
"step": 2900
},
{
"epoch": 0.6024922118380063,
"grad_norm": 0.5268486142158508,
"learning_rate": 8e-05,
"loss": 1.5491,
"step": 2901
},
{
"epoch": 0.60269989615784,
"grad_norm": 0.5094104409217834,
"learning_rate": 8e-05,
"loss": 1.546,
"step": 2902
},
{
"epoch": 0.6029075804776739,
"grad_norm": 0.5445466637611389,
"learning_rate": 8e-05,
"loss": 1.5345,
"step": 2903
},
{
"epoch": 0.6031152647975078,
"grad_norm": 0.5537915229797363,
"learning_rate": 8e-05,
"loss": 1.5728,
"step": 2904
},
{
"epoch": 0.6033229491173416,
"grad_norm": 0.5232887268066406,
"learning_rate": 8e-05,
"loss": 1.5328,
"step": 2905
},
{
"epoch": 0.6035306334371755,
"grad_norm": 0.5250490307807922,
"learning_rate": 8e-05,
"loss": 1.6072,
"step": 2906
},
{
"epoch": 0.6037383177570094,
"grad_norm": 0.5186933279037476,
"learning_rate": 8e-05,
"loss": 1.5354,
"step": 2907
},
{
"epoch": 0.6039460020768432,
"grad_norm": 0.5262655019760132,
"learning_rate": 8e-05,
"loss": 1.5914,
"step": 2908
},
{
"epoch": 0.6041536863966771,
"grad_norm": 0.5019351243972778,
"learning_rate": 8e-05,
"loss": 1.5235,
"step": 2909
},
{
"epoch": 0.6043613707165109,
"grad_norm": 0.4954097270965576,
"learning_rate": 8e-05,
"loss": 1.5543,
"step": 2910
},
{
"epoch": 0.6045690550363447,
"grad_norm": 0.512431800365448,
"learning_rate": 8e-05,
"loss": 1.5578,
"step": 2911
},
{
"epoch": 0.6047767393561786,
"grad_norm": 0.5076355934143066,
"learning_rate": 8e-05,
"loss": 1.561,
"step": 2912
},
{
"epoch": 0.6049844236760125,
"grad_norm": 0.5237712860107422,
"learning_rate": 8e-05,
"loss": 1.5584,
"step": 2913
},
{
"epoch": 0.6051921079958463,
"grad_norm": 0.5152472257614136,
"learning_rate": 8e-05,
"loss": 1.5429,
"step": 2914
},
{
"epoch": 0.6053997923156802,
"grad_norm": 0.5119134783744812,
"learning_rate": 8e-05,
"loss": 1.5631,
"step": 2915
},
{
"epoch": 0.6056074766355141,
"grad_norm": 0.5183678865432739,
"learning_rate": 8e-05,
"loss": 1.5855,
"step": 2916
},
{
"epoch": 0.6058151609553478,
"grad_norm": 0.49895596504211426,
"learning_rate": 8e-05,
"loss": 1.5152,
"step": 2917
},
{
"epoch": 0.6060228452751817,
"grad_norm": 0.5047574043273926,
"learning_rate": 8e-05,
"loss": 1.5017,
"step": 2918
},
{
"epoch": 0.6062305295950156,
"grad_norm": 0.5099537372589111,
"learning_rate": 8e-05,
"loss": 1.6106,
"step": 2919
},
{
"epoch": 0.6064382139148494,
"grad_norm": 0.5520958304405212,
"learning_rate": 8e-05,
"loss": 1.6264,
"step": 2920
},
{
"epoch": 0.6066458982346833,
"grad_norm": 0.5420238375663757,
"learning_rate": 8e-05,
"loss": 1.4995,
"step": 2921
},
{
"epoch": 0.6068535825545172,
"grad_norm": 0.5576627850532532,
"learning_rate": 8e-05,
"loss": 1.559,
"step": 2922
},
{
"epoch": 0.607061266874351,
"grad_norm": 0.5126776695251465,
"learning_rate": 8e-05,
"loss": 1.5922,
"step": 2923
},
{
"epoch": 0.6072689511941849,
"grad_norm": 0.525304913520813,
"learning_rate": 8e-05,
"loss": 1.6135,
"step": 2924
},
{
"epoch": 0.6074766355140186,
"grad_norm": 0.5227261781692505,
"learning_rate": 8e-05,
"loss": 1.6031,
"step": 2925
},
{
"epoch": 0.6076843198338525,
"grad_norm": 0.5197954773902893,
"learning_rate": 8e-05,
"loss": 1.5591,
"step": 2926
},
{
"epoch": 0.6078920041536864,
"grad_norm": 0.5202577114105225,
"learning_rate": 8e-05,
"loss": 1.5878,
"step": 2927
},
{
"epoch": 0.6080996884735203,
"grad_norm": 0.5209299921989441,
"learning_rate": 8e-05,
"loss": 1.5847,
"step": 2928
},
{
"epoch": 0.6083073727933541,
"grad_norm": 0.5529150366783142,
"learning_rate": 8e-05,
"loss": 1.5966,
"step": 2929
},
{
"epoch": 0.608515057113188,
"grad_norm": 0.5113958120346069,
"learning_rate": 8e-05,
"loss": 1.5736,
"step": 2930
},
{
"epoch": 0.6087227414330219,
"grad_norm": 0.5247915983200073,
"learning_rate": 8e-05,
"loss": 1.5975,
"step": 2931
},
{
"epoch": 0.6089304257528556,
"grad_norm": 0.5492368340492249,
"learning_rate": 8e-05,
"loss": 1.5366,
"step": 2932
},
{
"epoch": 0.6091381100726895,
"grad_norm": 0.5445862412452698,
"learning_rate": 8e-05,
"loss": 1.5235,
"step": 2933
},
{
"epoch": 0.6093457943925233,
"grad_norm": 0.5135022401809692,
"learning_rate": 8e-05,
"loss": 1.5669,
"step": 2934
},
{
"epoch": 0.6095534787123572,
"grad_norm": 0.5535476803779602,
"learning_rate": 8e-05,
"loss": 1.5846,
"step": 2935
},
{
"epoch": 0.6097611630321911,
"grad_norm": 0.5978537797927856,
"learning_rate": 8e-05,
"loss": 1.5923,
"step": 2936
},
{
"epoch": 0.609968847352025,
"grad_norm": 0.5011223554611206,
"learning_rate": 8e-05,
"loss": 1.5336,
"step": 2937
},
{
"epoch": 0.6101765316718588,
"grad_norm": 0.5410469770431519,
"learning_rate": 8e-05,
"loss": 1.5075,
"step": 2938
},
{
"epoch": 0.6103842159916927,
"grad_norm": 0.5120070576667786,
"learning_rate": 8e-05,
"loss": 1.5891,
"step": 2939
},
{
"epoch": 0.6105919003115264,
"grad_norm": 0.6055616140365601,
"learning_rate": 8e-05,
"loss": 1.4815,
"step": 2940
},
{
"epoch": 0.6107995846313603,
"grad_norm": 0.5197583436965942,
"learning_rate": 8e-05,
"loss": 1.5151,
"step": 2941
},
{
"epoch": 0.6110072689511942,
"grad_norm": 0.5293586850166321,
"learning_rate": 8e-05,
"loss": 1.6045,
"step": 2942
},
{
"epoch": 0.611214953271028,
"grad_norm": 0.5343866348266602,
"learning_rate": 8e-05,
"loss": 1.5406,
"step": 2943
},
{
"epoch": 0.6114226375908619,
"grad_norm": 0.5327885746955872,
"learning_rate": 8e-05,
"loss": 1.5581,
"step": 2944
},
{
"epoch": 0.6116303219106958,
"grad_norm": 0.5412554740905762,
"learning_rate": 8e-05,
"loss": 1.6321,
"step": 2945
},
{
"epoch": 0.6118380062305296,
"grad_norm": 0.5238969922065735,
"learning_rate": 8e-05,
"loss": 1.611,
"step": 2946
},
{
"epoch": 0.6120456905503634,
"grad_norm": 0.5044591426849365,
"learning_rate": 8e-05,
"loss": 1.5551,
"step": 2947
},
{
"epoch": 0.6122533748701973,
"grad_norm": 0.6135985255241394,
"learning_rate": 8e-05,
"loss": 1.5487,
"step": 2948
},
{
"epoch": 0.6124610591900311,
"grad_norm": 0.5363000631332397,
"learning_rate": 8e-05,
"loss": 1.5965,
"step": 2949
},
{
"epoch": 0.612668743509865,
"grad_norm": 0.5959219932556152,
"learning_rate": 8e-05,
"loss": 1.642,
"step": 2950
},
{
"epoch": 0.6128764278296989,
"grad_norm": 0.5022416710853577,
"learning_rate": 8e-05,
"loss": 1.5019,
"step": 2951
},
{
"epoch": 0.6130841121495327,
"grad_norm": 0.5241201519966125,
"learning_rate": 8e-05,
"loss": 1.5078,
"step": 2952
},
{
"epoch": 0.6132917964693666,
"grad_norm": 0.5270079970359802,
"learning_rate": 8e-05,
"loss": 1.5387,
"step": 2953
},
{
"epoch": 0.6134994807892005,
"grad_norm": 0.548295795917511,
"learning_rate": 8e-05,
"loss": 1.6071,
"step": 2954
},
{
"epoch": 0.6137071651090342,
"grad_norm": 0.5182678699493408,
"learning_rate": 8e-05,
"loss": 1.5863,
"step": 2955
},
{
"epoch": 0.6139148494288681,
"grad_norm": 0.5253292918205261,
"learning_rate": 8e-05,
"loss": 1.5464,
"step": 2956
},
{
"epoch": 0.614122533748702,
"grad_norm": 0.492260605096817,
"learning_rate": 8e-05,
"loss": 1.4529,
"step": 2957
},
{
"epoch": 0.6143302180685358,
"grad_norm": 0.6125956773757935,
"learning_rate": 8e-05,
"loss": 1.5692,
"step": 2958
},
{
"epoch": 0.6145379023883697,
"grad_norm": 0.6347879767417908,
"learning_rate": 8e-05,
"loss": 1.6548,
"step": 2959
},
{
"epoch": 0.6147455867082036,
"grad_norm": 0.5303540825843811,
"learning_rate": 8e-05,
"loss": 1.6188,
"step": 2960
},
{
"epoch": 0.6149532710280374,
"grad_norm": 0.5319226384162903,
"learning_rate": 8e-05,
"loss": 1.497,
"step": 2961
},
{
"epoch": 0.6151609553478712,
"grad_norm": 0.5587071180343628,
"learning_rate": 8e-05,
"loss": 1.6561,
"step": 2962
},
{
"epoch": 0.615368639667705,
"grad_norm": 0.5441991686820984,
"learning_rate": 8e-05,
"loss": 1.5086,
"step": 2963
},
{
"epoch": 0.6155763239875389,
"grad_norm": 0.4938202500343323,
"learning_rate": 8e-05,
"loss": 1.5147,
"step": 2964
},
{
"epoch": 0.6157840083073728,
"grad_norm": 0.5410319566726685,
"learning_rate": 8e-05,
"loss": 1.6568,
"step": 2965
},
{
"epoch": 0.6159916926272067,
"grad_norm": 0.531709611415863,
"learning_rate": 8e-05,
"loss": 1.5243,
"step": 2966
},
{
"epoch": 0.6161993769470405,
"grad_norm": 0.5373491644859314,
"learning_rate": 8e-05,
"loss": 1.581,
"step": 2967
},
{
"epoch": 0.6164070612668744,
"grad_norm": 0.5017946362495422,
"learning_rate": 8e-05,
"loss": 1.5224,
"step": 2968
},
{
"epoch": 0.6166147455867083,
"grad_norm": 0.510314404964447,
"learning_rate": 8e-05,
"loss": 1.6072,
"step": 2969
},
{
"epoch": 0.616822429906542,
"grad_norm": 0.519645631313324,
"learning_rate": 8e-05,
"loss": 1.5428,
"step": 2970
},
{
"epoch": 0.6170301142263759,
"grad_norm": 0.5236020088195801,
"learning_rate": 8e-05,
"loss": 1.5758,
"step": 2971
},
{
"epoch": 0.6172377985462097,
"grad_norm": 0.521712064743042,
"learning_rate": 8e-05,
"loss": 1.5461,
"step": 2972
},
{
"epoch": 0.6174454828660436,
"grad_norm": 0.5110123157501221,
"learning_rate": 8e-05,
"loss": 1.5284,
"step": 2973
},
{
"epoch": 0.6176531671858775,
"grad_norm": 0.5233707427978516,
"learning_rate": 8e-05,
"loss": 1.5502,
"step": 2974
},
{
"epoch": 0.6178608515057114,
"grad_norm": 0.532986581325531,
"learning_rate": 8e-05,
"loss": 1.5442,
"step": 2975
},
{
"epoch": 0.6180685358255452,
"grad_norm": 0.5192633867263794,
"learning_rate": 8e-05,
"loss": 1.5694,
"step": 2976
},
{
"epoch": 0.618276220145379,
"grad_norm": 0.5456683039665222,
"learning_rate": 8e-05,
"loss": 1.545,
"step": 2977
},
{
"epoch": 0.6184839044652128,
"grad_norm": 0.5262755751609802,
"learning_rate": 8e-05,
"loss": 1.5764,
"step": 2978
},
{
"epoch": 0.6186915887850467,
"grad_norm": 0.5116017460823059,
"learning_rate": 8e-05,
"loss": 1.6291,
"step": 2979
},
{
"epoch": 0.6188992731048806,
"grad_norm": 0.5196945667266846,
"learning_rate": 8e-05,
"loss": 1.5285,
"step": 2980
},
{
"epoch": 0.6191069574247144,
"grad_norm": 0.525067925453186,
"learning_rate": 8e-05,
"loss": 1.5704,
"step": 2981
},
{
"epoch": 0.6193146417445483,
"grad_norm": 0.5199265480041504,
"learning_rate": 8e-05,
"loss": 1.5554,
"step": 2982
},
{
"epoch": 0.6195223260643822,
"grad_norm": 0.5034192204475403,
"learning_rate": 8e-05,
"loss": 1.5135,
"step": 2983
},
{
"epoch": 0.619730010384216,
"grad_norm": 0.5294010043144226,
"learning_rate": 8e-05,
"loss": 1.5838,
"step": 2984
},
{
"epoch": 0.6199376947040498,
"grad_norm": 0.5039417743682861,
"learning_rate": 8e-05,
"loss": 1.5084,
"step": 2985
},
{
"epoch": 0.6201453790238837,
"grad_norm": 0.5068845152854919,
"learning_rate": 8e-05,
"loss": 1.5872,
"step": 2986
},
{
"epoch": 0.6203530633437175,
"grad_norm": 0.521058201789856,
"learning_rate": 8e-05,
"loss": 1.4993,
"step": 2987
},
{
"epoch": 0.6205607476635514,
"grad_norm": 0.5185069441795349,
"learning_rate": 8e-05,
"loss": 1.54,
"step": 2988
},
{
"epoch": 0.6207684319833853,
"grad_norm": 0.5288187265396118,
"learning_rate": 8e-05,
"loss": 1.5946,
"step": 2989
},
{
"epoch": 0.6209761163032191,
"grad_norm": 0.5338004231452942,
"learning_rate": 8e-05,
"loss": 1.5141,
"step": 2990
},
{
"epoch": 0.621183800623053,
"grad_norm": 0.5099069476127625,
"learning_rate": 8e-05,
"loss": 1.5778,
"step": 2991
},
{
"epoch": 0.6213914849428868,
"grad_norm": 0.5432291626930237,
"learning_rate": 8e-05,
"loss": 1.5671,
"step": 2992
},
{
"epoch": 0.6215991692627206,
"grad_norm": 0.5076735019683838,
"learning_rate": 8e-05,
"loss": 1.5615,
"step": 2993
},
{
"epoch": 0.6218068535825545,
"grad_norm": 0.5396541357040405,
"learning_rate": 8e-05,
"loss": 1.5829,
"step": 2994
},
{
"epoch": 0.6220145379023884,
"grad_norm": 0.5078719854354858,
"learning_rate": 8e-05,
"loss": 1.5176,
"step": 2995
},
{
"epoch": 0.6222222222222222,
"grad_norm": 0.503608226776123,
"learning_rate": 8e-05,
"loss": 1.531,
"step": 2996
},
{
"epoch": 0.6224299065420561,
"grad_norm": 0.5179398655891418,
"learning_rate": 8e-05,
"loss": 1.6097,
"step": 2997
},
{
"epoch": 0.62263759086189,
"grad_norm": 0.5163557529449463,
"learning_rate": 8e-05,
"loss": 1.6221,
"step": 2998
},
{
"epoch": 0.6228452751817238,
"grad_norm": 0.5109874606132507,
"learning_rate": 8e-05,
"loss": 1.5046,
"step": 2999
},
{
"epoch": 0.6230529595015576,
"grad_norm": 0.5169478058815002,
"learning_rate": 8e-05,
"loss": 1.5841,
"step": 3000
}
],
"logging_steps": 1,
"max_steps": 4815,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7030412912033792e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}