{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.8149190710767065,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0028149190710767065,
"grad_norm": 0.3785315454006195,
"learning_rate": 4e-05,
"loss": 2.7534,
"step": 1
},
{
"epoch": 0.005629838142153413,
"grad_norm": 0.3462165296077728,
"learning_rate": 8e-05,
"loss": 2.7611,
"step": 2
},
{
"epoch": 0.00844475721323012,
"grad_norm": 0.41192108392715454,
"learning_rate": 0.00012,
"loss": 2.9121,
"step": 3
},
{
"epoch": 0.011259676284306826,
"grad_norm": 0.4809342324733734,
"learning_rate": 0.00016,
"loss": 2.9178,
"step": 4
},
{
"epoch": 0.014074595355383532,
"grad_norm": 0.4055400490760803,
"learning_rate": 0.0002,
"loss": 2.9431,
"step": 5
},
{
"epoch": 0.01688951442646024,
"grad_norm": 0.4463144838809967,
"learning_rate": 0.00019979899497487438,
"loss": 2.7718,
"step": 6
},
{
"epoch": 0.019704433497536946,
"grad_norm": 0.49435216188430786,
"learning_rate": 0.00019959798994974876,
"loss": 2.7197,
"step": 7
},
{
"epoch": 0.022519352568613652,
"grad_norm": 0.5844013690948486,
"learning_rate": 0.00019939698492462313,
"loss": 2.7076,
"step": 8
},
{
"epoch": 0.025334271639690358,
"grad_norm": 0.6181543469429016,
"learning_rate": 0.0001991959798994975,
"loss": 2.8055,
"step": 9
},
{
"epoch": 0.028149190710767064,
"grad_norm": 0.7984749674797058,
"learning_rate": 0.00019899497487437187,
"loss": 2.7629,
"step": 10
},
{
"epoch": 0.03096410978184377,
"grad_norm": 0.6961840391159058,
"learning_rate": 0.00019879396984924622,
"loss": 2.6917,
"step": 11
},
{
"epoch": 0.03377902885292048,
"grad_norm": 0.837709367275238,
"learning_rate": 0.00019859296482412062,
"loss": 2.6108,
"step": 12
},
{
"epoch": 0.036593947923997186,
"grad_norm": 0.7435119152069092,
"learning_rate": 0.000198391959798995,
"loss": 2.7828,
"step": 13
},
{
"epoch": 0.03940886699507389,
"grad_norm": 0.8047707080841064,
"learning_rate": 0.00019819095477386937,
"loss": 2.7011,
"step": 14
},
{
"epoch": 0.0422237860661506,
"grad_norm": 0.9793757796287537,
"learning_rate": 0.0001979899497487437,
"loss": 2.865,
"step": 15
},
{
"epoch": 0.045038705137227304,
"grad_norm": 0.7536874413490295,
"learning_rate": 0.0001977889447236181,
"loss": 2.7922,
"step": 16
},
{
"epoch": 0.04785362420830401,
"grad_norm": 0.6820270419120789,
"learning_rate": 0.00019758793969849249,
"loss": 3.1034,
"step": 17
},
{
"epoch": 0.050668543279380716,
"grad_norm": 0.7651283144950867,
"learning_rate": 0.00019738693467336683,
"loss": 2.8736,
"step": 18
},
{
"epoch": 0.05348346235045742,
"grad_norm": 0.5590704083442688,
"learning_rate": 0.0001971859296482412,
"loss": 2.6802,
"step": 19
},
{
"epoch": 0.05629838142153413,
"grad_norm": 0.5996040105819702,
"learning_rate": 0.0001969849246231156,
"loss": 2.6352,
"step": 20
},
{
"epoch": 0.059113300492610835,
"grad_norm": 0.6097638607025146,
"learning_rate": 0.00019678391959798995,
"loss": 2.5763,
"step": 21
},
{
"epoch": 0.06192821956368754,
"grad_norm": 0.5201358795166016,
"learning_rate": 0.00019658291457286432,
"loss": 2.6492,
"step": 22
},
{
"epoch": 0.06474313863476425,
"grad_norm": 0.8090603351593018,
"learning_rate": 0.0001963819095477387,
"loss": 2.5719,
"step": 23
},
{
"epoch": 0.06755805770584096,
"grad_norm": 0.6470005512237549,
"learning_rate": 0.0001961809045226131,
"loss": 2.9084,
"step": 24
},
{
"epoch": 0.07037297677691766,
"grad_norm": 0.6126617193222046,
"learning_rate": 0.00019597989949748744,
"loss": 2.7863,
"step": 25
},
{
"epoch": 0.07318789584799437,
"grad_norm": 0.5378536581993103,
"learning_rate": 0.00019577889447236181,
"loss": 2.6437,
"step": 26
},
{
"epoch": 0.07600281491907107,
"grad_norm": 0.6851357817649841,
"learning_rate": 0.0001955778894472362,
"loss": 2.6539,
"step": 27
},
{
"epoch": 0.07881773399014778,
"grad_norm": 0.6153799295425415,
"learning_rate": 0.00019537688442211056,
"loss": 2.67,
"step": 28
},
{
"epoch": 0.08163265306122448,
"grad_norm": 0.5324752926826477,
"learning_rate": 0.00019517587939698493,
"loss": 2.6674,
"step": 29
},
{
"epoch": 0.0844475721323012,
"grad_norm": 0.5797032713890076,
"learning_rate": 0.0001949748743718593,
"loss": 2.5109,
"step": 30
},
{
"epoch": 0.08726249120337791,
"grad_norm": 0.6937679052352905,
"learning_rate": 0.00019477386934673368,
"loss": 2.7514,
"step": 31
},
{
"epoch": 0.09007741027445461,
"grad_norm": 0.6234177350997925,
"learning_rate": 0.00019457286432160805,
"loss": 2.6147,
"step": 32
},
{
"epoch": 0.09289232934553132,
"grad_norm": 0.6435564756393433,
"learning_rate": 0.00019437185929648243,
"loss": 2.6747,
"step": 33
},
{
"epoch": 0.09570724841660802,
"grad_norm": 0.6985692381858826,
"learning_rate": 0.0001941708542713568,
"loss": 2.7223,
"step": 34
},
{
"epoch": 0.09852216748768473,
"grad_norm": 0.6003565788269043,
"learning_rate": 0.00019396984924623117,
"loss": 2.6478,
"step": 35
},
{
"epoch": 0.10133708655876143,
"grad_norm": 0.7325728535652161,
"learning_rate": 0.00019376884422110552,
"loss": 2.855,
"step": 36
},
{
"epoch": 0.10415200562983815,
"grad_norm": 0.6490616798400879,
"learning_rate": 0.00019356783919597992,
"loss": 2.8664,
"step": 37
},
{
"epoch": 0.10696692470091484,
"grad_norm": 0.6137815713882446,
"learning_rate": 0.0001933668341708543,
"loss": 2.5355,
"step": 38
},
{
"epoch": 0.10978184377199156,
"grad_norm": 0.6218917369842529,
"learning_rate": 0.00019316582914572864,
"loss": 2.7318,
"step": 39
},
{
"epoch": 0.11259676284306826,
"grad_norm": 0.6341124773025513,
"learning_rate": 0.000192964824120603,
"loss": 2.5333,
"step": 40
},
{
"epoch": 0.11541168191414497,
"grad_norm": 0.5556070804595947,
"learning_rate": 0.0001927638190954774,
"loss": 2.9166,
"step": 41
},
{
"epoch": 0.11822660098522167,
"grad_norm": 0.5476509928703308,
"learning_rate": 0.00019256281407035178,
"loss": 2.8262,
"step": 42
},
{
"epoch": 0.12104152005629838,
"grad_norm": 0.8177505731582642,
"learning_rate": 0.00019236180904522613,
"loss": 2.9477,
"step": 43
},
{
"epoch": 0.12385643912737508,
"grad_norm": 0.6593706011772156,
"learning_rate": 0.0001921608040201005,
"loss": 2.8449,
"step": 44
},
{
"epoch": 0.1266713581984518,
"grad_norm": 0.59237140417099,
"learning_rate": 0.0001919597989949749,
"loss": 2.6079,
"step": 45
},
{
"epoch": 0.1294862772695285,
"grad_norm": 0.5167338252067566,
"learning_rate": 0.00019175879396984925,
"loss": 2.6834,
"step": 46
},
{
"epoch": 0.13230119634060522,
"grad_norm": 0.5484845042228699,
"learning_rate": 0.00019155778894472362,
"loss": 2.5793,
"step": 47
},
{
"epoch": 0.13511611541168192,
"grad_norm": 0.5930073261260986,
"learning_rate": 0.000191356783919598,
"loss": 2.757,
"step": 48
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.6741965413093567,
"learning_rate": 0.0001911557788944724,
"loss": 2.7182,
"step": 49
},
{
"epoch": 0.14074595355383532,
"grad_norm": 0.558120608329773,
"learning_rate": 0.00019095477386934674,
"loss": 2.6401,
"step": 50
},
{
"epoch": 0.14356087262491204,
"grad_norm": 0.6161705255508423,
"learning_rate": 0.0001907537688442211,
"loss": 2.6181,
"step": 51
},
{
"epoch": 0.14637579169598874,
"grad_norm": 0.6661592721939087,
"learning_rate": 0.00019055276381909548,
"loss": 2.7207,
"step": 52
},
{
"epoch": 0.14919071076706544,
"grad_norm": 0.5285555720329285,
"learning_rate": 0.00019035175879396986,
"loss": 2.5631,
"step": 53
},
{
"epoch": 0.15200562983814214,
"grad_norm": 0.6050645709037781,
"learning_rate": 0.00019015075376884423,
"loss": 2.4716,
"step": 54
},
{
"epoch": 0.15482054890921887,
"grad_norm": 0.6041057109832764,
"learning_rate": 0.0001899497487437186,
"loss": 2.7241,
"step": 55
},
{
"epoch": 0.15763546798029557,
"grad_norm": 0.6147128343582153,
"learning_rate": 0.00018974874371859298,
"loss": 2.6362,
"step": 56
},
{
"epoch": 0.16045038705137227,
"grad_norm": 0.5417614579200745,
"learning_rate": 0.00018954773869346732,
"loss": 2.8325,
"step": 57
},
{
"epoch": 0.16326530612244897,
"grad_norm": 0.5944551229476929,
"learning_rate": 0.00018934673366834172,
"loss": 2.6793,
"step": 58
},
{
"epoch": 0.1660802251935257,
"grad_norm": 0.6394937634468079,
"learning_rate": 0.0001891457286432161,
"loss": 2.721,
"step": 59
},
{
"epoch": 0.1688951442646024,
"grad_norm": 0.5581662058830261,
"learning_rate": 0.00018894472361809047,
"loss": 2.6786,
"step": 60
},
{
"epoch": 0.1717100633356791,
"grad_norm": 0.5921449065208435,
"learning_rate": 0.00018874371859296481,
"loss": 2.5239,
"step": 61
},
{
"epoch": 0.17452498240675582,
"grad_norm": 0.5900184512138367,
"learning_rate": 0.00018854271356783921,
"loss": 2.6708,
"step": 62
},
{
"epoch": 0.17733990147783252,
"grad_norm": 0.6194185614585876,
"learning_rate": 0.0001883417085427136,
"loss": 2.9428,
"step": 63
},
{
"epoch": 0.18015482054890922,
"grad_norm": 0.629349410533905,
"learning_rate": 0.00018814070351758793,
"loss": 2.6705,
"step": 64
},
{
"epoch": 0.18296973961998592,
"grad_norm": 0.5497152805328369,
"learning_rate": 0.0001879396984924623,
"loss": 2.8205,
"step": 65
},
{
"epoch": 0.18578465869106264,
"grad_norm": 0.5276259779930115,
"learning_rate": 0.0001877386934673367,
"loss": 2.5922,
"step": 66
},
{
"epoch": 0.18859957776213934,
"grad_norm": 0.7193230390548706,
"learning_rate": 0.00018753768844221108,
"loss": 2.867,
"step": 67
},
{
"epoch": 0.19141449683321604,
"grad_norm": 0.6483210325241089,
"learning_rate": 0.00018733668341708543,
"loss": 2.9455,
"step": 68
},
{
"epoch": 0.19422941590429274,
"grad_norm": 0.7181980013847351,
"learning_rate": 0.0001871356783919598,
"loss": 2.7443,
"step": 69
},
{
"epoch": 0.19704433497536947,
"grad_norm": 0.6001389026641846,
"learning_rate": 0.0001869346733668342,
"loss": 2.6464,
"step": 70
},
{
"epoch": 0.19985925404644617,
"grad_norm": 0.7344582080841064,
"learning_rate": 0.00018673366834170854,
"loss": 2.6694,
"step": 71
},
{
"epoch": 0.20267417311752287,
"grad_norm": 0.6493490934371948,
"learning_rate": 0.00018653266331658292,
"loss": 2.6506,
"step": 72
},
{
"epoch": 0.20548909218859956,
"grad_norm": 0.5350422859191895,
"learning_rate": 0.0001863316582914573,
"loss": 2.6709,
"step": 73
},
{
"epoch": 0.2083040112596763,
"grad_norm": 0.5754289031028748,
"learning_rate": 0.0001861306532663317,
"loss": 2.3618,
"step": 74
},
{
"epoch": 0.211118930330753,
"grad_norm": 0.6207188367843628,
"learning_rate": 0.00018592964824120604,
"loss": 2.7498,
"step": 75
},
{
"epoch": 0.2139338494018297,
"grad_norm": 0.5524656176567078,
"learning_rate": 0.0001857286432160804,
"loss": 2.4996,
"step": 76
},
{
"epoch": 0.21674876847290642,
"grad_norm": 0.7466227412223816,
"learning_rate": 0.00018552763819095478,
"loss": 2.944,
"step": 77
},
{
"epoch": 0.21956368754398312,
"grad_norm": 0.6438124179840088,
"learning_rate": 0.00018532663316582915,
"loss": 2.7136,
"step": 78
},
{
"epoch": 0.22237860661505982,
"grad_norm": 0.6562415957450867,
"learning_rate": 0.00018512562814070353,
"loss": 2.797,
"step": 79
},
{
"epoch": 0.22519352568613651,
"grad_norm": 0.6599562168121338,
"learning_rate": 0.0001849246231155779,
"loss": 2.511,
"step": 80
},
{
"epoch": 0.22800844475721324,
"grad_norm": 0.6012830138206482,
"learning_rate": 0.00018472361809045227,
"loss": 2.5117,
"step": 81
},
{
"epoch": 0.23082336382828994,
"grad_norm": 0.5618470907211304,
"learning_rate": 0.00018452261306532662,
"loss": 2.7258,
"step": 82
},
{
"epoch": 0.23363828289936664,
"grad_norm": 0.7711282968521118,
"learning_rate": 0.00018432160804020102,
"loss": 2.8518,
"step": 83
},
{
"epoch": 0.23645320197044334,
"grad_norm": 0.5676078200340271,
"learning_rate": 0.0001841206030150754,
"loss": 2.7253,
"step": 84
},
{
"epoch": 0.23926812104152007,
"grad_norm": 0.8567176461219788,
"learning_rate": 0.00018391959798994977,
"loss": 2.767,
"step": 85
},
{
"epoch": 0.24208304011259676,
"grad_norm": 0.5816414952278137,
"learning_rate": 0.0001837185929648241,
"loss": 2.5211,
"step": 86
},
{
"epoch": 0.24489795918367346,
"grad_norm": 0.5357186198234558,
"learning_rate": 0.0001835175879396985,
"loss": 2.5882,
"step": 87
},
{
"epoch": 0.24771287825475016,
"grad_norm": 0.5406627655029297,
"learning_rate": 0.00018331658291457288,
"loss": 2.3529,
"step": 88
},
{
"epoch": 0.25052779732582686,
"grad_norm": 0.9183681607246399,
"learning_rate": 0.00018311557788944723,
"loss": 2.9405,
"step": 89
},
{
"epoch": 0.2533427163969036,
"grad_norm": 0.5938777327537537,
"learning_rate": 0.0001829145728643216,
"loss": 2.8762,
"step": 90
},
{
"epoch": 0.2561576354679803,
"grad_norm": 0.559532880783081,
"learning_rate": 0.000182713567839196,
"loss": 2.4396,
"step": 91
},
{
"epoch": 0.258972554539057,
"grad_norm": 0.8062023520469666,
"learning_rate": 0.00018251256281407038,
"loss": 2.8835,
"step": 92
},
{
"epoch": 0.2617874736101337,
"grad_norm": 0.5407679080963135,
"learning_rate": 0.00018231155778894472,
"loss": 2.5692,
"step": 93
},
{
"epoch": 0.26460239268121044,
"grad_norm": 0.5537972450256348,
"learning_rate": 0.0001821105527638191,
"loss": 2.4051,
"step": 94
},
{
"epoch": 0.2674173117522871,
"grad_norm": 0.6128715872764587,
"learning_rate": 0.0001819095477386935,
"loss": 2.6613,
"step": 95
},
{
"epoch": 0.27023223082336384,
"grad_norm": 0.9666823148727417,
"learning_rate": 0.00018170854271356784,
"loss": 2.6943,
"step": 96
},
{
"epoch": 0.2730471498944405,
"grad_norm": 0.587451696395874,
"learning_rate": 0.00018150753768844221,
"loss": 2.5654,
"step": 97
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.6436663269996643,
"learning_rate": 0.0001813065326633166,
"loss": 2.5957,
"step": 98
},
{
"epoch": 0.27867698803659396,
"grad_norm": 0.5880750417709351,
"learning_rate": 0.00018110552763819096,
"loss": 2.8445,
"step": 99
},
{
"epoch": 0.28149190710767064,
"grad_norm": 0.5972994565963745,
"learning_rate": 0.00018090452261306533,
"loss": 2.6186,
"step": 100
},
{
"epoch": 0.28430682617874736,
"grad_norm": 0.5434820652008057,
"learning_rate": 0.0001807035175879397,
"loss": 2.8376,
"step": 101
},
{
"epoch": 0.2871217452498241,
"grad_norm": 0.5735207200050354,
"learning_rate": 0.00018050251256281408,
"loss": 2.4118,
"step": 102
},
{
"epoch": 0.28993666432090076,
"grad_norm": 0.5313388705253601,
"learning_rate": 0.00018030150753768845,
"loss": 2.468,
"step": 103
},
{
"epoch": 0.2927515833919775,
"grad_norm": 0.6161223649978638,
"learning_rate": 0.00018010050251256282,
"loss": 2.663,
"step": 104
},
{
"epoch": 0.2955665024630542,
"grad_norm": 0.5644655227661133,
"learning_rate": 0.0001798994974874372,
"loss": 2.3672,
"step": 105
},
{
"epoch": 0.2983814215341309,
"grad_norm": 0.6080154776573181,
"learning_rate": 0.00017969849246231157,
"loss": 2.6672,
"step": 106
},
{
"epoch": 0.3011963406052076,
"grad_norm": 0.5323423147201538,
"learning_rate": 0.00017949748743718592,
"loss": 2.8084,
"step": 107
},
{
"epoch": 0.3040112596762843,
"grad_norm": 0.5441535711288452,
"learning_rate": 0.00017929648241206032,
"loss": 2.5269,
"step": 108
},
{
"epoch": 0.306826178747361,
"grad_norm": 0.5068178772926331,
"learning_rate": 0.0001790954773869347,
"loss": 2.5472,
"step": 109
},
{
"epoch": 0.30964109781843774,
"grad_norm": 0.6056650876998901,
"learning_rate": 0.00017889447236180906,
"loss": 2.6083,
"step": 110
},
{
"epoch": 0.3124560168895144,
"grad_norm": 0.5633851885795593,
"learning_rate": 0.0001786934673366834,
"loss": 2.5353,
"step": 111
},
{
"epoch": 0.31527093596059114,
"grad_norm": 0.6467467546463013,
"learning_rate": 0.0001784924623115578,
"loss": 2.7402,
"step": 112
},
{
"epoch": 0.31808585503166786,
"grad_norm": 0.590074360370636,
"learning_rate": 0.00017829145728643218,
"loss": 2.7417,
"step": 113
},
{
"epoch": 0.32090077410274453,
"grad_norm": 0.5952100157737732,
"learning_rate": 0.00017809045226130653,
"loss": 2.4225,
"step": 114
},
{
"epoch": 0.32371569317382126,
"grad_norm": 0.5567030310630798,
"learning_rate": 0.0001778894472361809,
"loss": 2.5335,
"step": 115
},
{
"epoch": 0.32653061224489793,
"grad_norm": 0.6068913340568542,
"learning_rate": 0.0001776884422110553,
"loss": 2.6689,
"step": 116
},
{
"epoch": 0.32934553131597466,
"grad_norm": 0.5481736660003662,
"learning_rate": 0.00017748743718592967,
"loss": 2.5618,
"step": 117
},
{
"epoch": 0.3321604503870514,
"grad_norm": 0.5849531888961792,
"learning_rate": 0.00017728643216080402,
"loss": 2.8453,
"step": 118
},
{
"epoch": 0.33497536945812806,
"grad_norm": 0.6313461065292358,
"learning_rate": 0.0001770854271356784,
"loss": 2.6883,
"step": 119
},
{
"epoch": 0.3377902885292048,
"grad_norm": 1.3009490966796875,
"learning_rate": 0.0001768844221105528,
"loss": 2.5748,
"step": 120
},
{
"epoch": 0.3406052076002815,
"grad_norm": 0.5591140985488892,
"learning_rate": 0.00017668341708542714,
"loss": 2.7251,
"step": 121
},
{
"epoch": 0.3434201266713582,
"grad_norm": 0.5167106986045837,
"learning_rate": 0.0001764824120603015,
"loss": 2.6753,
"step": 122
},
{
"epoch": 0.3462350457424349,
"grad_norm": 0.6912369132041931,
"learning_rate": 0.00017628140703517588,
"loss": 2.9036,
"step": 123
},
{
"epoch": 0.34904996481351164,
"grad_norm": 0.7323533892631531,
"learning_rate": 0.00017608040201005026,
"loss": 2.7168,
"step": 124
},
{
"epoch": 0.3518648838845883,
"grad_norm": 0.5843552947044373,
"learning_rate": 0.00017587939698492463,
"loss": 2.5335,
"step": 125
},
{
"epoch": 0.35467980295566504,
"grad_norm": 0.6250912547111511,
"learning_rate": 0.000175678391959799,
"loss": 2.7493,
"step": 126
},
{
"epoch": 0.3574947220267417,
"grad_norm": 0.5447134375572205,
"learning_rate": 0.00017547738693467338,
"loss": 2.5758,
"step": 127
},
{
"epoch": 0.36030964109781843,
"grad_norm": 0.8142397403717041,
"learning_rate": 0.00017527638190954775,
"loss": 2.8021,
"step": 128
},
{
"epoch": 0.36312456016889516,
"grad_norm": null,
"learning_rate": 0.00017527638190954775,
"loss": 2.6983,
"step": 129
},
{
"epoch": 0.36593947923997183,
"grad_norm": 0.5528063774108887,
"learning_rate": 0.00017507537688442212,
"loss": 2.695,
"step": 130
},
{
"epoch": 0.36875439831104856,
"grad_norm": 0.605383574962616,
"learning_rate": 0.0001748743718592965,
"loss": 2.7013,
"step": 131
},
{
"epoch": 0.3715693173821253,
"grad_norm": 0.627310037612915,
"learning_rate": 0.00017467336683417087,
"loss": 2.7744,
"step": 132
},
{
"epoch": 0.37438423645320196,
"grad_norm": 0.6117985844612122,
"learning_rate": 0.00017447236180904521,
"loss": 2.9001,
"step": 133
},
{
"epoch": 0.3771991555242787,
"grad_norm": 0.5570118427276611,
"learning_rate": 0.00017427135678391961,
"loss": 2.6795,
"step": 134
},
{
"epoch": 0.3800140745953554,
"grad_norm": 0.6382287740707397,
"learning_rate": 0.000174070351758794,
"loss": 2.8177,
"step": 135
},
{
"epoch": 0.3828289936664321,
"grad_norm": 0.7003315091133118,
"learning_rate": 0.00017386934673366836,
"loss": 2.531,
"step": 136
},
{
"epoch": 0.3856439127375088,
"grad_norm": 0.5270616412162781,
"learning_rate": 0.0001736683417085427,
"loss": 2.7267,
"step": 137
},
{
"epoch": 0.3884588318085855,
"grad_norm": 0.6856080889701843,
"learning_rate": 0.0001734673366834171,
"loss": 2.8481,
"step": 138
},
{
"epoch": 0.3912737508796622,
"grad_norm": 0.7767484784126282,
"learning_rate": 0.00017326633165829148,
"loss": 2.7403,
"step": 139
},
{
"epoch": 0.39408866995073893,
"grad_norm": 0.5755979418754578,
"learning_rate": 0.00017306532663316582,
"loss": 2.6526,
"step": 140
},
{
"epoch": 0.3969035890218156,
"grad_norm": 0.6108975410461426,
"learning_rate": 0.0001728643216080402,
"loss": 2.5531,
"step": 141
},
{
"epoch": 0.39971850809289233,
"grad_norm": 0.6080026030540466,
"learning_rate": 0.0001726633165829146,
"loss": 2.7344,
"step": 142
},
{
"epoch": 0.40253342716396906,
"grad_norm": 0.5954862833023071,
"learning_rate": 0.00017246231155778897,
"loss": 2.6179,
"step": 143
},
{
"epoch": 0.40534834623504573,
"grad_norm": 0.7604647874832153,
"learning_rate": 0.00017226130653266332,
"loss": 2.8556,
"step": 144
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.540407657623291,
"learning_rate": 0.0001720603015075377,
"loss": 2.671,
"step": 145
},
{
"epoch": 0.41097818437719913,
"grad_norm": 0.5598605871200562,
"learning_rate": 0.00017185929648241206,
"loss": 2.697,
"step": 146
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.6206982135772705,
"learning_rate": 0.00017165829145728644,
"loss": 2.5687,
"step": 147
},
{
"epoch": 0.4166080225193526,
"grad_norm": 0.5486766695976257,
"learning_rate": 0.0001714572864321608,
"loss": 2.8242,
"step": 148
},
{
"epoch": 0.41942294159042925,
"grad_norm": 0.7132663130760193,
"learning_rate": 0.00017125628140703518,
"loss": 2.8706,
"step": 149
},
{
"epoch": 0.422237860661506,
"grad_norm": 0.6359018087387085,
"learning_rate": 0.00017105527638190955,
"loss": 2.773,
"step": 150
},
{
"epoch": 0.4250527797325827,
"grad_norm": 0.5943129062652588,
"learning_rate": 0.00017085427135678393,
"loss": 2.6535,
"step": 151
},
{
"epoch": 0.4278676988036594,
"grad_norm": 0.6567736864089966,
"learning_rate": 0.0001706532663316583,
"loss": 2.6059,
"step": 152
},
{
"epoch": 0.4306826178747361,
"grad_norm": 0.6345821619033813,
"learning_rate": 0.00017045226130653267,
"loss": 2.5776,
"step": 153
},
{
"epoch": 0.43349753694581283,
"grad_norm": 0.9386352896690369,
"learning_rate": 0.00017025125628140705,
"loss": 2.7449,
"step": 154
},
{
"epoch": 0.4363124560168895,
"grad_norm": 0.5455414652824402,
"learning_rate": 0.00017005025125628142,
"loss": 2.3967,
"step": 155
},
{
"epoch": 0.43912737508796623,
"grad_norm": 0.7040349841117859,
"learning_rate": 0.0001698492462311558,
"loss": 2.9214,
"step": 156
},
{
"epoch": 0.4419422941590429,
"grad_norm": 0.5507174730300903,
"learning_rate": 0.00016964824120603016,
"loss": 2.5087,
"step": 157
},
{
"epoch": 0.44475721323011963,
"grad_norm": 0.6239134669303894,
"learning_rate": 0.0001694472361809045,
"loss": 3.2032,
"step": 158
},
{
"epoch": 0.44757213230119636,
"grad_norm": 0.7403885722160339,
"learning_rate": 0.0001692462311557789,
"loss": 2.8829,
"step": 159
},
{
"epoch": 0.45038705137227303,
"grad_norm": 0.5260657072067261,
"learning_rate": 0.00016904522613065328,
"loss": 2.4572,
"step": 160
},
{
"epoch": 0.45320197044334976,
"grad_norm": 0.7505115270614624,
"learning_rate": 0.00016884422110552766,
"loss": 2.7264,
"step": 161
},
{
"epoch": 0.4560168895144265,
"grad_norm": 0.6088585257530212,
"learning_rate": 0.000168643216080402,
"loss": 2.5714,
"step": 162
},
{
"epoch": 0.45883180858550315,
"grad_norm": 0.6011828184127808,
"learning_rate": 0.0001684422110552764,
"loss": 2.597,
"step": 163
},
{
"epoch": 0.4616467276565799,
"grad_norm": 0.5229634046554565,
"learning_rate": 0.00016824120603015078,
"loss": 2.723,
"step": 164
},
{
"epoch": 0.4644616467276566,
"grad_norm": 0.6184930801391602,
"learning_rate": 0.00016804020100502512,
"loss": 2.703,
"step": 165
},
{
"epoch": 0.4672765657987333,
"grad_norm": 0.6252800226211548,
"learning_rate": 0.0001678391959798995,
"loss": 2.5517,
"step": 166
},
{
"epoch": 0.47009148486981,
"grad_norm": 0.5429969429969788,
"learning_rate": 0.0001676381909547739,
"loss": 2.7248,
"step": 167
},
{
"epoch": 0.4729064039408867,
"grad_norm": 0.6234527230262756,
"learning_rate": 0.00016743718592964827,
"loss": 2.5844,
"step": 168
},
{
"epoch": 0.4757213230119634,
"grad_norm": 0.6872987747192383,
"learning_rate": 0.0001672361809045226,
"loss": 2.5067,
"step": 169
},
{
"epoch": 0.47853624208304013,
"grad_norm": 0.5591785907745361,
"learning_rate": 0.00016703517587939699,
"loss": 2.4999,
"step": 170
},
{
"epoch": 0.4813511611541168,
"grad_norm": 0.5767291188240051,
"learning_rate": 0.00016683417085427136,
"loss": 2.8885,
"step": 171
},
{
"epoch": 0.48416608022519353,
"grad_norm": 0.6422219276428223,
"learning_rate": 0.00016663316582914573,
"loss": 2.6989,
"step": 172
},
{
"epoch": 0.48698099929627026,
"grad_norm": 0.6035985350608826,
"learning_rate": 0.0001664321608040201,
"loss": 2.6412,
"step": 173
},
{
"epoch": 0.4897959183673469,
"grad_norm": 0.5744962096214294,
"learning_rate": 0.00016623115577889448,
"loss": 2.6395,
"step": 174
},
{
"epoch": 0.49261083743842365,
"grad_norm": 0.6550725698471069,
"learning_rate": 0.00016603015075376885,
"loss": 2.7526,
"step": 175
},
{
"epoch": 0.4954257565095003,
"grad_norm": 0.7883411049842834,
"learning_rate": 0.00016582914572864322,
"loss": 2.5775,
"step": 176
},
{
"epoch": 0.49824067558057705,
"grad_norm": 0.6014293432235718,
"learning_rate": 0.0001656281407035176,
"loss": 2.6568,
"step": 177
},
{
"epoch": 0.5010555946516537,
"grad_norm": 0.5285369157791138,
"learning_rate": 0.00016542713567839197,
"loss": 2.4862,
"step": 178
},
{
"epoch": 0.5038705137227305,
"grad_norm": 0.6501176953315735,
"learning_rate": 0.00016522613065326634,
"loss": 2.631,
"step": 179
},
{
"epoch": 0.5066854327938072,
"grad_norm": 0.6903632879257202,
"learning_rate": 0.00016502512562814072,
"loss": 2.7363,
"step": 180
},
{
"epoch": 0.5095003518648838,
"grad_norm": 0.6202127933502197,
"learning_rate": 0.0001648241206030151,
"loss": 2.6678,
"step": 181
},
{
"epoch": 0.5123152709359606,
"grad_norm": 0.688332200050354,
"learning_rate": 0.00016462311557788946,
"loss": 2.6346,
"step": 182
},
{
"epoch": 0.5151301900070373,
"grad_norm": 0.5428361892700195,
"learning_rate": 0.0001644221105527638,
"loss": 2.9607,
"step": 183
},
{
"epoch": 0.517945109078114,
"grad_norm": 0.5395454168319702,
"learning_rate": 0.0001642211055276382,
"loss": 2.8427,
"step": 184
},
{
"epoch": 0.5207600281491908,
"grad_norm": 0.554793655872345,
"learning_rate": 0.00016402010050251258,
"loss": 2.4523,
"step": 185
},
{
"epoch": 0.5235749472202674,
"grad_norm": 0.5698427557945251,
"learning_rate": 0.00016381909547738695,
"loss": 2.7052,
"step": 186
},
{
"epoch": 0.5263898662913441,
"grad_norm": 0.5099812150001526,
"learning_rate": 0.0001636180904522613,
"loss": 2.3651,
"step": 187
},
{
"epoch": 0.5292047853624209,
"grad_norm": 0.5726649761199951,
"learning_rate": 0.0001634170854271357,
"loss": 2.9888,
"step": 188
},
{
"epoch": 0.5320197044334976,
"grad_norm": 0.630757212638855,
"learning_rate": 0.00016321608040201007,
"loss": 2.6886,
"step": 189
},
{
"epoch": 0.5348346235045742,
"grad_norm": 0.5425901412963867,
"learning_rate": 0.00016301507537688442,
"loss": 2.6047,
"step": 190
},
{
"epoch": 0.5376495425756509,
"grad_norm": 0.6030427813529968,
"learning_rate": 0.0001628140703517588,
"loss": 2.8713,
"step": 191
},
{
"epoch": 0.5404644616467277,
"grad_norm": 0.5966265201568604,
"learning_rate": 0.00016261306532663316,
"loss": 2.7212,
"step": 192
},
{
"epoch": 0.5432793807178043,
"grad_norm": 0.5909569263458252,
"learning_rate": 0.00016241206030150756,
"loss": 2.8696,
"step": 193
},
{
"epoch": 0.546094299788881,
"grad_norm": 0.5485597252845764,
"learning_rate": 0.0001622110552763819,
"loss": 2.6367,
"step": 194
},
{
"epoch": 0.5489092188599578,
"grad_norm": 0.6274406909942627,
"learning_rate": 0.00016201005025125628,
"loss": 2.6882,
"step": 195
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.5513599514961243,
"learning_rate": 0.00016180904522613066,
"loss": 2.6596,
"step": 196
},
{
"epoch": 0.5545390570021111,
"grad_norm": 0.7499818801879883,
"learning_rate": 0.00016160804020100503,
"loss": 2.6191,
"step": 197
},
{
"epoch": 0.5573539760731879,
"grad_norm": 0.5143455862998962,
"learning_rate": 0.0001614070351758794,
"loss": 2.6543,
"step": 198
},
{
"epoch": 0.5601688951442646,
"grad_norm": 0.6372074484825134,
"learning_rate": 0.00016120603015075378,
"loss": 2.4355,
"step": 199
},
{
"epoch": 0.5629838142153413,
"grad_norm": 0.7132628560066223,
"learning_rate": 0.00016100502512562815,
"loss": 2.671,
"step": 200
},
{
"epoch": 0.565798733286418,
"grad_norm": 0.5970779657363892,
"learning_rate": 0.00016080402010050252,
"loss": 2.6802,
"step": 201
},
{
"epoch": 0.5686136523574947,
"grad_norm": 0.6065824627876282,
"learning_rate": 0.0001606030150753769,
"loss": 2.6151,
"step": 202
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.5521674752235413,
"learning_rate": 0.00016040201005025127,
"loss": 2.5132,
"step": 203
},
{
"epoch": 0.5742434904996482,
"grad_norm": 0.6067800521850586,
"learning_rate": 0.00016020100502512564,
"loss": 2.9283,
"step": 204
},
{
"epoch": 0.5770584095707249,
"grad_norm": 0.5979752540588379,
"learning_rate": 0.00016,
"loss": 2.5355,
"step": 205
},
{
"epoch": 0.5798733286418015,
"grad_norm": 0.6044461727142334,
"learning_rate": 0.00015979899497487439,
"loss": 2.6752,
"step": 206
},
{
"epoch": 0.5826882477128783,
"grad_norm": 0.580636739730835,
"learning_rate": 0.00015959798994974876,
"loss": 2.538,
"step": 207
},
{
"epoch": 0.585503166783955,
"grad_norm": 0.6181825995445251,
"learning_rate": 0.0001593969849246231,
"loss": 2.6529,
"step": 208
},
{
"epoch": 0.5883180858550316,
"grad_norm": 0.6641463041305542,
"learning_rate": 0.0001591959798994975,
"loss": 2.6252,
"step": 209
},
{
"epoch": 0.5911330049261084,
"grad_norm": 0.6233858466148376,
"learning_rate": 0.00015899497487437188,
"loss": 2.6832,
"step": 210
},
{
"epoch": 0.5939479239971851,
"grad_norm": 0.6696732044219971,
"learning_rate": 0.00015879396984924625,
"loss": 3.0325,
"step": 211
},
{
"epoch": 0.5967628430682618,
"grad_norm": 0.7569646239280701,
"learning_rate": 0.0001585929648241206,
"loss": 2.675,
"step": 212
},
{
"epoch": 0.5995777621393384,
"grad_norm": 0.5962279438972473,
"learning_rate": 0.000158391959798995,
"loss": 2.6369,
"step": 213
},
{
"epoch": 0.6023926812104152,
"grad_norm": 0.6349969506263733,
"learning_rate": 0.00015819095477386937,
"loss": 2.5531,
"step": 214
},
{
"epoch": 0.6052076002814919,
"grad_norm": 0.8234291076660156,
"learning_rate": 0.00015798994974874372,
"loss": 2.5466,
"step": 215
},
{
"epoch": 0.6080225193525686,
"grad_norm": 0.6057316660881042,
"learning_rate": 0.0001577889447236181,
"loss": 2.3296,
"step": 216
},
{
"epoch": 0.6108374384236454,
"grad_norm": 0.6568176746368408,
"learning_rate": 0.00015758793969849246,
"loss": 2.8075,
"step": 217
},
{
"epoch": 0.613652357494722,
"grad_norm": 0.5945923328399658,
"learning_rate": 0.00015738693467336686,
"loss": 2.696,
"step": 218
},
{
"epoch": 0.6164672765657987,
"grad_norm": 0.6226676106452942,
"learning_rate": 0.0001571859296482412,
"loss": 2.5764,
"step": 219
},
{
"epoch": 0.6192821956368755,
"grad_norm": 0.6158185601234436,
"learning_rate": 0.00015698492462311558,
"loss": 2.66,
"step": 220
},
{
"epoch": 0.6220971147079521,
"grad_norm": 0.7033487558364868,
"learning_rate": 0.00015678391959798995,
"loss": 2.7747,
"step": 221
},
{
"epoch": 0.6249120337790288,
"grad_norm": 0.5215992331504822,
"learning_rate": 0.00015658291457286433,
"loss": 2.4176,
"step": 222
},
{
"epoch": 0.6277269528501056,
"grad_norm": 0.8559087514877319,
"learning_rate": 0.0001563819095477387,
"loss": 2.8081,
"step": 223
},
{
"epoch": 0.6305418719211823,
"grad_norm": 0.5106130242347717,
"learning_rate": 0.00015618090452261307,
"loss": 2.6433,
"step": 224
},
{
"epoch": 0.633356790992259,
"grad_norm": 0.6176455020904541,
"learning_rate": 0.00015597989949748745,
"loss": 2.4351,
"step": 225
},
{
"epoch": 0.6361717100633357,
"grad_norm": 0.8193095922470093,
"learning_rate": 0.00015577889447236182,
"loss": 2.8882,
"step": 226
},
{
"epoch": 0.6389866291344124,
"grad_norm": 0.8569721579551697,
"learning_rate": 0.0001555778894472362,
"loss": 2.7263,
"step": 227
},
{
"epoch": 0.6418015482054891,
"grad_norm": 0.6688103079795837,
"learning_rate": 0.00015537688442211056,
"loss": 2.6202,
"step": 228
},
{
"epoch": 0.6446164672765659,
"grad_norm": 0.6070395708084106,
"learning_rate": 0.00015517587939698494,
"loss": 2.7117,
"step": 229
},
{
"epoch": 0.6474313863476425,
"grad_norm": 0.7812969088554382,
"learning_rate": 0.0001549748743718593,
"loss": 2.5815,
"step": 230
},
{
"epoch": 0.6502463054187192,
"grad_norm": 0.8165440559387207,
"learning_rate": 0.00015477386934673368,
"loss": 2.8503,
"step": 231
},
{
"epoch": 0.6530612244897959,
"grad_norm": 0.6454505324363708,
"learning_rate": 0.00015457286432160806,
"loss": 2.5918,
"step": 232
},
{
"epoch": 0.6558761435608726,
"grad_norm": 0.7109069228172302,
"learning_rate": 0.0001543718592964824,
"loss": 2.5554,
"step": 233
},
{
"epoch": 0.6586910626319493,
"grad_norm": 0.6079565286636353,
"learning_rate": 0.0001541708542713568,
"loss": 2.6232,
"step": 234
},
{
"epoch": 0.661505981703026,
"grad_norm": 0.576082170009613,
"learning_rate": 0.00015396984924623117,
"loss": 2.8387,
"step": 235
},
{
"epoch": 0.6643209007741028,
"grad_norm": 0.5683891177177429,
"learning_rate": 0.00015376884422110555,
"loss": 2.7391,
"step": 236
},
{
"epoch": 0.6671358198451794,
"grad_norm": 0.6114887595176697,
"learning_rate": 0.0001535678391959799,
"loss": 2.6629,
"step": 237
},
{
"epoch": 0.6699507389162561,
"grad_norm": 0.6666116118431091,
"learning_rate": 0.00015336683417085427,
"loss": 2.9027,
"step": 238
},
{
"epoch": 0.6727656579873329,
"grad_norm": 0.5646522641181946,
"learning_rate": 0.00015316582914572867,
"loss": 2.7252,
"step": 239
},
{
"epoch": 0.6755805770584096,
"grad_norm": 0.6885817646980286,
"learning_rate": 0.000152964824120603,
"loss": 2.5966,
"step": 240
},
{
"epoch": 0.6783954961294862,
"grad_norm": 0.5778309106826782,
"learning_rate": 0.00015276381909547739,
"loss": 2.6275,
"step": 241
},
{
"epoch": 0.681210415200563,
"grad_norm": 0.6230787038803101,
"learning_rate": 0.00015256281407035176,
"loss": 2.5258,
"step": 242
},
{
"epoch": 0.6840253342716397,
"grad_norm": 0.5411630272865295,
"learning_rate": 0.00015236180904522613,
"loss": 2.6047,
"step": 243
},
{
"epoch": 0.6868402533427164,
"grad_norm": 0.5547896027565002,
"learning_rate": 0.0001521608040201005,
"loss": 2.6451,
"step": 244
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.6721991300582886,
"learning_rate": 0.00015195979899497488,
"loss": 2.7127,
"step": 245
},
{
"epoch": 0.6924700914848698,
"grad_norm": 0.6402304172515869,
"learning_rate": 0.00015175879396984925,
"loss": 2.8568,
"step": 246
},
{
"epoch": 0.6952850105559465,
"grad_norm": 0.594251275062561,
"learning_rate": 0.00015155778894472362,
"loss": 2.7168,
"step": 247
},
{
"epoch": 0.6980999296270233,
"grad_norm": 0.8853170871734619,
"learning_rate": 0.000151356783919598,
"loss": 2.7259,
"step": 248
},
{
"epoch": 0.7009148486981,
"grad_norm": 0.5619581341743469,
"learning_rate": 0.00015115577889447237,
"loss": 2.6218,
"step": 249
},
{
"epoch": 0.7037297677691766,
"grad_norm": 0.6149075031280518,
"learning_rate": 0.00015095477386934674,
"loss": 2.8984,
"step": 250
},
{
"epoch": 0.7065446868402533,
"grad_norm": 0.6819274425506592,
"learning_rate": 0.00015075376884422112,
"loss": 2.8293,
"step": 251
},
{
"epoch": 0.7093596059113301,
"grad_norm": 0.5911348462104797,
"learning_rate": 0.0001505527638190955,
"loss": 2.6333,
"step": 252
},
{
"epoch": 0.7121745249824067,
"grad_norm": 0.7064481973648071,
"learning_rate": 0.00015035175879396986,
"loss": 2.8061,
"step": 253
},
{
"epoch": 0.7149894440534834,
"grad_norm": 0.6039316654205322,
"learning_rate": 0.00015015075376884423,
"loss": 2.6468,
"step": 254
},
{
"epoch": 0.7178043631245602,
"grad_norm": 0.5624644756317139,
"learning_rate": 0.0001499497487437186,
"loss": 2.7813,
"step": 255
},
{
"epoch": 0.7206192821956369,
"grad_norm": 0.5971612334251404,
"learning_rate": 0.00014974874371859298,
"loss": 2.6411,
"step": 256
},
{
"epoch": 0.7234342012667135,
"grad_norm": 0.6717031598091125,
"learning_rate": 0.00014954773869346735,
"loss": 2.5579,
"step": 257
},
{
"epoch": 0.7262491203377903,
"grad_norm": 0.6643320322036743,
"learning_rate": 0.0001493467336683417,
"loss": 2.8469,
"step": 258
},
{
"epoch": 0.729064039408867,
"grad_norm": 0.5971053838729858,
"learning_rate": 0.0001491457286432161,
"loss": 2.613,
"step": 259
},
{
"epoch": 0.7318789584799437,
"grad_norm": 0.6267710328102112,
"learning_rate": 0.00014894472361809047,
"loss": 2.6969,
"step": 260
},
{
"epoch": 0.7346938775510204,
"grad_norm": 0.6237425804138184,
"learning_rate": 0.00014874371859296482,
"loss": 2.7047,
"step": 261
},
{
"epoch": 0.7375087966220971,
"grad_norm": 0.5603229999542236,
"learning_rate": 0.0001485427135678392,
"loss": 2.7534,
"step": 262
},
{
"epoch": 0.7403237156931738,
"grad_norm": 0.6484439969062805,
"learning_rate": 0.00014834170854271356,
"loss": 2.9389,
"step": 263
},
{
"epoch": 0.7431386347642506,
"grad_norm": 0.6225891709327698,
"learning_rate": 0.00014814070351758796,
"loss": 2.7516,
"step": 264
},
{
"epoch": 0.7459535538353272,
"grad_norm": 0.5303828716278076,
"learning_rate": 0.0001479396984924623,
"loss": 2.3335,
"step": 265
},
{
"epoch": 0.7487684729064039,
"grad_norm": 0.6280227303504944,
"learning_rate": 0.00014773869346733668,
"loss": 2.6259,
"step": 266
},
{
"epoch": 0.7515833919774807,
"grad_norm": 0.5551609992980957,
"learning_rate": 0.00014753768844221106,
"loss": 2.6572,
"step": 267
},
{
"epoch": 0.7543983110485574,
"grad_norm": 0.7833865284919739,
"learning_rate": 0.00014733668341708543,
"loss": 2.7358,
"step": 268
},
{
"epoch": 0.757213230119634,
"grad_norm": 0.6138265132904053,
"learning_rate": 0.0001471356783919598,
"loss": 2.4282,
"step": 269
},
{
"epoch": 0.7600281491907108,
"grad_norm": 0.6331743001937866,
"learning_rate": 0.00014693467336683417,
"loss": 2.6373,
"step": 270
},
{
"epoch": 0.7628430682617875,
"grad_norm": 0.569272518157959,
"learning_rate": 0.00014673366834170855,
"loss": 2.6546,
"step": 271
},
{
"epoch": 0.7656579873328642,
"grad_norm": 0.6755379438400269,
"learning_rate": 0.00014653266331658292,
"loss": 2.6258,
"step": 272
},
{
"epoch": 0.7684729064039408,
"grad_norm": 0.6408460140228271,
"learning_rate": 0.0001463316582914573,
"loss": 2.6824,
"step": 273
},
{
"epoch": 0.7712878254750176,
"grad_norm": 0.6325194239616394,
"learning_rate": 0.00014613065326633167,
"loss": 2.7959,
"step": 274
},
{
"epoch": 0.7741027445460943,
"grad_norm": 0.6526459455490112,
"learning_rate": 0.00014592964824120604,
"loss": 2.613,
"step": 275
},
{
"epoch": 0.776917663617171,
"grad_norm": 0.610998272895813,
"learning_rate": 0.0001457286432160804,
"loss": 2.4973,
"step": 276
},
{
"epoch": 0.7797325826882477,
"grad_norm": 0.510045051574707,
"learning_rate": 0.00014552763819095479,
"loss": 2.7343,
"step": 277
},
{
"epoch": 0.7825475017593244,
"grad_norm": 0.5863422155380249,
"learning_rate": 0.00014532663316582916,
"loss": 2.5543,
"step": 278
},
{
"epoch": 0.7853624208304011,
"grad_norm": 0.5406447649002075,
"learning_rate": 0.00014512562814070353,
"loss": 2.7748,
"step": 279
},
{
"epoch": 0.7881773399014779,
"grad_norm": 0.7465657591819763,
"learning_rate": 0.0001449246231155779,
"loss": 2.4034,
"step": 280
},
{
"epoch": 0.7909922589725545,
"grad_norm": 0.5192904472351074,
"learning_rate": 0.00014472361809045228,
"loss": 2.4881,
"step": 281
},
{
"epoch": 0.7938071780436312,
"grad_norm": 0.6085344552993774,
"learning_rate": 0.00014452261306532665,
"loss": 2.6534,
"step": 282
},
{
"epoch": 0.796622097114708,
"grad_norm": 0.6155668497085571,
"learning_rate": 0.000144321608040201,
"loss": 2.6149,
"step": 283
},
{
"epoch": 0.7994370161857847,
"grad_norm": 0.623285710811615,
"learning_rate": 0.00014412060301507537,
"loss": 2.7725,
"step": 284
},
{
"epoch": 0.8022519352568613,
"grad_norm": 0.9461747407913208,
"learning_rate": 0.00014391959798994977,
"loss": 2.7837,
"step": 285
},
{
"epoch": 0.8050668543279381,
"grad_norm": 0.7152134776115417,
"learning_rate": 0.00014371859296482411,
"loss": 2.6678,
"step": 286
},
{
"epoch": 0.8078817733990148,
"grad_norm": 0.6519983410835266,
"learning_rate": 0.0001435175879396985,
"loss": 2.5234,
"step": 287
},
{
"epoch": 0.8106966924700915,
"grad_norm": 0.6523590087890625,
"learning_rate": 0.00014331658291457286,
"loss": 2.7284,
"step": 288
},
{
"epoch": 0.8135116115411682,
"grad_norm": 0.6067202687263489,
"learning_rate": 0.00014311557788944726,
"loss": 2.5459,
"step": 289
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.5836743116378784,
"learning_rate": 0.0001429145728643216,
"loss": 2.452,
"step": 290
},
{
"epoch": 0.8191414496833216,
"grad_norm": 0.685727596282959,
"learning_rate": 0.00014271356783919598,
"loss": 2.6568,
"step": 291
},
{
"epoch": 0.8219563687543983,
"grad_norm": 0.6456769704818726,
"learning_rate": 0.00014251256281407035,
"loss": 2.6553,
"step": 292
},
{
"epoch": 0.824771287825475,
"grad_norm": 0.6357674598693848,
"learning_rate": 0.00014231155778894473,
"loss": 2.7589,
"step": 293
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.6339374780654907,
"learning_rate": 0.0001421105527638191,
"loss": 2.7797,
"step": 294
},
{
"epoch": 0.8304011259676284,
"grad_norm": 0.5491819381713867,
"learning_rate": 0.00014190954773869347,
"loss": 2.4772,
"step": 295
},
{
"epoch": 0.8332160450387052,
"grad_norm": 0.6312305331230164,
"learning_rate": 0.00014170854271356784,
"loss": 2.6715,
"step": 296
},
{
"epoch": 0.8360309641097818,
"grad_norm": 0.647985577583313,
"learning_rate": 0.00014150753768844222,
"loss": 2.7811,
"step": 297
},
{
"epoch": 0.8388458831808585,
"grad_norm": 0.6383928060531616,
"learning_rate": 0.0001413065326633166,
"loss": 2.8496,
"step": 298
},
{
"epoch": 0.8416608022519353,
"grad_norm": 0.5548710823059082,
"learning_rate": 0.00014110552763819096,
"loss": 2.5096,
"step": 299
},
{
"epoch": 0.844475721323012,
"grad_norm": 0.5331722497940063,
"learning_rate": 0.00014090452261306534,
"loss": 2.5683,
"step": 300
},
{
"epoch": 0.8472906403940886,
"grad_norm": 0.5956087112426758,
"learning_rate": 0.0001407035175879397,
"loss": 2.6175,
"step": 301
},
{
"epoch": 0.8501055594651654,
"grad_norm": 0.6151571273803711,
"learning_rate": 0.00014050251256281408,
"loss": 2.5863,
"step": 302
},
{
"epoch": 0.8529204785362421,
"grad_norm": 0.5952453017234802,
"learning_rate": 0.00014030150753768846,
"loss": 2.5152,
"step": 303
},
{
"epoch": 0.8557353976073188,
"grad_norm": 0.6127233505249023,
"learning_rate": 0.0001401005025125628,
"loss": 2.5753,
"step": 304
},
{
"epoch": 0.8585503166783955,
"grad_norm": 0.551474928855896,
"learning_rate": 0.0001398994974874372,
"loss": 2.6523,
"step": 305
},
{
"epoch": 0.8613652357494722,
"grad_norm": 0.8345268368721008,
"learning_rate": 0.00013969849246231157,
"loss": 2.78,
"step": 306
},
{
"epoch": 0.8641801548205489,
"grad_norm": 0.6494585275650024,
"learning_rate": 0.00013949748743718595,
"loss": 2.7354,
"step": 307
},
{
"epoch": 0.8669950738916257,
"grad_norm": 0.6813188791275024,
"learning_rate": 0.0001392964824120603,
"loss": 2.8736,
"step": 308
},
{
"epoch": 0.8698099929627023,
"grad_norm": 0.6250954270362854,
"learning_rate": 0.00013909547738693467,
"loss": 2.7548,
"step": 309
},
{
"epoch": 0.872624912033779,
"grad_norm": 0.6115372180938721,
"learning_rate": 0.00013889447236180907,
"loss": 2.7126,
"step": 310
},
{
"epoch": 0.8754398311048557,
"grad_norm": 0.6005333662033081,
"learning_rate": 0.0001386934673366834,
"loss": 2.9623,
"step": 311
},
{
"epoch": 0.8782547501759325,
"grad_norm": 0.5203389525413513,
"learning_rate": 0.00013849246231155778,
"loss": 2.7269,
"step": 312
},
{
"epoch": 0.8810696692470091,
"grad_norm": 0.5951765775680542,
"learning_rate": 0.00013829145728643216,
"loss": 2.6058,
"step": 313
},
{
"epoch": 0.8838845883180858,
"grad_norm": 0.6142780184745789,
"learning_rate": 0.00013809045226130656,
"loss": 2.5918,
"step": 314
},
{
"epoch": 0.8866995073891626,
"grad_norm": 0.5776972770690918,
"learning_rate": 0.0001378894472361809,
"loss": 2.5006,
"step": 315
},
{
"epoch": 0.8895144264602393,
"grad_norm": 0.6553467512130737,
"learning_rate": 0.00013768844221105528,
"loss": 2.5903,
"step": 316
},
{
"epoch": 0.8923293455313159,
"grad_norm": 0.5776195526123047,
"learning_rate": 0.00013748743718592965,
"loss": 2.5532,
"step": 317
},
{
"epoch": 0.8951442646023927,
"grad_norm": 0.5531054139137268,
"learning_rate": 0.00013728643216080402,
"loss": 2.624,
"step": 318
},
{
"epoch": 0.8979591836734694,
"grad_norm": 0.7402701377868652,
"learning_rate": 0.0001370854271356784,
"loss": 2.7686,
"step": 319
},
{
"epoch": 0.9007741027445461,
"grad_norm": 0.5394028425216675,
"learning_rate": 0.00013688442211055277,
"loss": 2.6044,
"step": 320
},
{
"epoch": 0.9035890218156228,
"grad_norm": 0.6454526782035828,
"learning_rate": 0.00013668341708542714,
"loss": 2.6938,
"step": 321
},
{
"epoch": 0.9064039408866995,
"grad_norm": 0.7545249462127686,
"learning_rate": 0.00013648241206030151,
"loss": 2.5263,
"step": 322
},
{
"epoch": 0.9092188599577762,
"grad_norm": 0.6479030251502991,
"learning_rate": 0.0001362814070351759,
"loss": 2.4625,
"step": 323
},
{
"epoch": 0.912033779028853,
"grad_norm": 0.9134926199913025,
"learning_rate": 0.00013608040201005026,
"loss": 2.7668,
"step": 324
},
{
"epoch": 0.9148486980999296,
"grad_norm": 0.6736027002334595,
"learning_rate": 0.00013587939698492463,
"loss": 2.826,
"step": 325
},
{
"epoch": 0.9176636171710063,
"grad_norm": 0.6161238551139832,
"learning_rate": 0.000135678391959799,
"loss": 2.513,
"step": 326
},
{
"epoch": 0.9204785362420831,
"grad_norm": 0.7301089763641357,
"learning_rate": 0.00013547738693467338,
"loss": 2.6374,
"step": 327
},
{
"epoch": 0.9232934553131598,
"grad_norm": 0.5782633423805237,
"learning_rate": 0.00013527638190954775,
"loss": 2.4847,
"step": 328
},
{
"epoch": 0.9261083743842364,
"grad_norm": 0.6025380492210388,
"learning_rate": 0.0001350753768844221,
"loss": 2.6272,
"step": 329
},
{
"epoch": 0.9289232934553132,
"grad_norm": 0.6242662668228149,
"learning_rate": 0.00013487437185929647,
"loss": 2.801,
"step": 330
},
{
"epoch": 0.9317382125263899,
"grad_norm": 0.7133350372314453,
"learning_rate": 0.00013467336683417087,
"loss": 2.5261,
"step": 331
},
{
"epoch": 0.9345531315974666,
"grad_norm": 0.5895963311195374,
"learning_rate": 0.00013447236180904524,
"loss": 2.7813,
"step": 332
},
{
"epoch": 0.9373680506685432,
"grad_norm": 0.7254224419593811,
"learning_rate": 0.0001342713567839196,
"loss": 2.6859,
"step": 333
},
{
"epoch": 0.94018296973962,
"grad_norm": 0.7255984544754028,
"learning_rate": 0.00013407035175879396,
"loss": 2.6328,
"step": 334
},
{
"epoch": 0.9429978888106967,
"grad_norm": 0.827979564666748,
"learning_rate": 0.00013386934673366836,
"loss": 2.7311,
"step": 335
},
{
"epoch": 0.9458128078817734,
"grad_norm": 0.6603137850761414,
"learning_rate": 0.0001336683417085427,
"loss": 2.6389,
"step": 336
},
{
"epoch": 0.9486277269528501,
"grad_norm": 0.6362401247024536,
"learning_rate": 0.00013346733668341708,
"loss": 2.7471,
"step": 337
},
{
"epoch": 0.9514426460239268,
"grad_norm": 0.7190608382225037,
"learning_rate": 0.00013326633165829146,
"loss": 2.5964,
"step": 338
},
{
"epoch": 0.9542575650950035,
"grad_norm": 0.6639814376831055,
"learning_rate": 0.00013306532663316586,
"loss": 2.6358,
"step": 339
},
{
"epoch": 0.9570724841660803,
"grad_norm": 0.6791893243789673,
"learning_rate": 0.0001328643216080402,
"loss": 2.6764,
"step": 340
},
{
"epoch": 0.9598874032371569,
"grad_norm": 0.6663180589675903,
"learning_rate": 0.00013266331658291457,
"loss": 2.8057,
"step": 341
},
{
"epoch": 0.9627023223082336,
"grad_norm": 0.5866056680679321,
"learning_rate": 0.00013246231155778895,
"loss": 2.6845,
"step": 342
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.6590510010719299,
"learning_rate": 0.00013226130653266332,
"loss": 2.4007,
"step": 343
},
{
"epoch": 0.9683321604503871,
"grad_norm": 0.6014566421508789,
"learning_rate": 0.0001320603015075377,
"loss": 2.4501,
"step": 344
},
{
"epoch": 0.9711470795214637,
"grad_norm": 0.7037169337272644,
"learning_rate": 0.00013185929648241207,
"loss": 3.0925,
"step": 345
},
{
"epoch": 0.9739619985925405,
"grad_norm": 0.5314791798591614,
"learning_rate": 0.00013165829145728644,
"loss": 2.4216,
"step": 346
},
{
"epoch": 0.9767769176636172,
"grad_norm": 0.5568397045135498,
"learning_rate": 0.0001314572864321608,
"loss": 2.4926,
"step": 347
},
{
"epoch": 0.9795918367346939,
"grad_norm": 0.728171706199646,
"learning_rate": 0.00013125628140703518,
"loss": 2.6955,
"step": 348
},
{
"epoch": 0.9824067558057706,
"grad_norm": 0.7184565663337708,
"learning_rate": 0.00013105527638190956,
"loss": 2.6989,
"step": 349
},
{
"epoch": 0.9852216748768473,
"grad_norm": 1.116127610206604,
"learning_rate": 0.00013085427135678393,
"loss": 2.5996,
"step": 350
},
{
"epoch": 0.988036593947924,
"grad_norm": 0.6320902705192566,
"learning_rate": 0.0001306532663316583,
"loss": 2.571,
"step": 351
},
{
"epoch": 0.9908515130190007,
"grad_norm": 0.6517199277877808,
"learning_rate": 0.00013045226130653268,
"loss": 2.6731,
"step": 352
},
{
"epoch": 0.9936664320900774,
"grad_norm": 0.6911283135414124,
"learning_rate": 0.00013025125628140705,
"loss": 2.8807,
"step": 353
},
{
"epoch": 0.9964813511611541,
"grad_norm": 0.7019006609916687,
"learning_rate": 0.0001300502512562814,
"loss": 2.6346,
"step": 354
},
{
"epoch": 0.9992962702322308,
"grad_norm": 0.6711909770965576,
"learning_rate": 0.00012984924623115577,
"loss": 2.7176,
"step": 355
},
{
"epoch": 1.0021111893033074,
"grad_norm": 0.6791936755180359,
"learning_rate": 0.00012964824120603017,
"loss": 2.4828,
"step": 356
},
{
"epoch": 1.0049261083743843,
"grad_norm": 0.550987720489502,
"learning_rate": 0.00012944723618090454,
"loss": 2.4396,
"step": 357
},
{
"epoch": 1.007741027445461,
"grad_norm": 0.6731054186820984,
"learning_rate": 0.0001292462311557789,
"loss": 2.7791,
"step": 358
},
{
"epoch": 1.0105559465165377,
"grad_norm": 0.5614567995071411,
"learning_rate": 0.00012904522613065326,
"loss": 2.5572,
"step": 359
},
{
"epoch": 1.0133708655876144,
"grad_norm": 0.5224441289901733,
"learning_rate": 0.00012884422110552766,
"loss": 2.3983,
"step": 360
},
{
"epoch": 1.016185784658691,
"grad_norm": 0.534264326095581,
"learning_rate": 0.000128643216080402,
"loss": 2.4682,
"step": 361
},
{
"epoch": 1.0190007037297677,
"grad_norm": 0.7560765743255615,
"learning_rate": 0.00012844221105527638,
"loss": 2.5593,
"step": 362
},
{
"epoch": 1.0218156228008444,
"grad_norm": 0.6609757542610168,
"learning_rate": 0.00012824120603015075,
"loss": 2.4428,
"step": 363
},
{
"epoch": 1.0246305418719213,
"grad_norm": 0.48449280858039856,
"learning_rate": 0.00012804020100502515,
"loss": 2.2216,
"step": 364
},
{
"epoch": 1.027445460942998,
"grad_norm": 0.6201764345169067,
"learning_rate": 0.0001278391959798995,
"loss": 2.4076,
"step": 365
},
{
"epoch": 1.0302603800140746,
"grad_norm": 0.6022098660469055,
"learning_rate": 0.00012763819095477387,
"loss": 2.3617,
"step": 366
},
{
"epoch": 1.0330752990851513,
"grad_norm": 0.5485665798187256,
"learning_rate": 0.00012743718592964824,
"loss": 2.4646,
"step": 367
},
{
"epoch": 1.035890218156228,
"grad_norm": 0.6300007700920105,
"learning_rate": 0.00012723618090452262,
"loss": 2.3845,
"step": 368
},
{
"epoch": 1.0387051372273046,
"grad_norm": 0.6588097214698792,
"learning_rate": 0.000127035175879397,
"loss": 2.3836,
"step": 369
},
{
"epoch": 1.0415200562983815,
"grad_norm": 0.6159886717796326,
"learning_rate": 0.00012683417085427136,
"loss": 2.239,
"step": 370
},
{
"epoch": 1.0443349753694582,
"grad_norm": 0.7142757177352905,
"learning_rate": 0.00012663316582914574,
"loss": 2.3208,
"step": 371
},
{
"epoch": 1.0471498944405349,
"grad_norm": 0.7620591521263123,
"learning_rate": 0.0001264321608040201,
"loss": 2.5665,
"step": 372
},
{
"epoch": 1.0499648135116115,
"grad_norm": 0.6486737728118896,
"learning_rate": 0.00012623115577889448,
"loss": 2.8276,
"step": 373
},
{
"epoch": 1.0527797325826882,
"grad_norm": 0.622787594795227,
"learning_rate": 0.00012603015075376885,
"loss": 2.4826,
"step": 374
},
{
"epoch": 1.0555946516537649,
"grad_norm": 0.6556206345558167,
"learning_rate": 0.00012582914572864323,
"loss": 2.5765,
"step": 375
},
{
"epoch": 1.0584095707248418,
"grad_norm": 0.6902799606323242,
"learning_rate": 0.0001256281407035176,
"loss": 2.3851,
"step": 376
},
{
"epoch": 1.0612244897959184,
"grad_norm": 0.6362977027893066,
"learning_rate": 0.00012542713567839197,
"loss": 2.4587,
"step": 377
},
{
"epoch": 1.064039408866995,
"grad_norm": 0.6027363538742065,
"learning_rate": 0.00012522613065326635,
"loss": 2.3945,
"step": 378
},
{
"epoch": 1.0668543279380718,
"grad_norm": 0.681010365486145,
"learning_rate": 0.0001250251256281407,
"loss": 2.247,
"step": 379
},
{
"epoch": 1.0696692470091484,
"grad_norm": 0.588394284248352,
"learning_rate": 0.00012482412060301507,
"loss": 2.3279,
"step": 380
},
{
"epoch": 1.0724841660802251,
"grad_norm": 0.6285263299942017,
"learning_rate": 0.00012462311557788947,
"loss": 2.2176,
"step": 381
},
{
"epoch": 1.0752990851513018,
"grad_norm": 0.6699137091636658,
"learning_rate": 0.00012442211055276384,
"loss": 2.4252,
"step": 382
},
{
"epoch": 1.0781140042223787,
"grad_norm": 0.7217219471931458,
"learning_rate": 0.00012422110552763818,
"loss": 2.3409,
"step": 383
},
{
"epoch": 1.0809289232934554,
"grad_norm": 0.6710893511772156,
"learning_rate": 0.00012402010050251256,
"loss": 2.4054,
"step": 384
},
{
"epoch": 1.083743842364532,
"grad_norm": 0.665313720703125,
"learning_rate": 0.00012381909547738696,
"loss": 2.4749,
"step": 385
},
{
"epoch": 1.0865587614356087,
"grad_norm": 0.655486524105072,
"learning_rate": 0.0001236180904522613,
"loss": 2.1261,
"step": 386
},
{
"epoch": 1.0893736805066854,
"grad_norm": 0.9402002692222595,
"learning_rate": 0.00012341708542713568,
"loss": 2.5344,
"step": 387
},
{
"epoch": 1.092188599577762,
"grad_norm": 0.760830819606781,
"learning_rate": 0.00012321608040201005,
"loss": 2.6215,
"step": 388
},
{
"epoch": 1.095003518648839,
"grad_norm": 0.7938470244407654,
"learning_rate": 0.00012301507537688445,
"loss": 2.3226,
"step": 389
},
{
"epoch": 1.0978184377199156,
"grad_norm": 0.789606511592865,
"learning_rate": 0.0001228140703517588,
"loss": 2.4419,
"step": 390
},
{
"epoch": 1.1006333567909923,
"grad_norm": 0.7316797375679016,
"learning_rate": 0.00012261306532663317,
"loss": 2.6489,
"step": 391
},
{
"epoch": 1.103448275862069,
"grad_norm": 0.7367439270019531,
"learning_rate": 0.00012241206030150754,
"loss": 2.2951,
"step": 392
},
{
"epoch": 1.1062631949331456,
"grad_norm": 0.735031008720398,
"learning_rate": 0.00012221105527638191,
"loss": 2.3752,
"step": 393
},
{
"epoch": 1.1090781140042223,
"grad_norm": 0.8442686796188354,
"learning_rate": 0.00012201005025125629,
"loss": 2.4054,
"step": 394
},
{
"epoch": 1.1118930330752992,
"grad_norm": 0.7112425565719604,
"learning_rate": 0.00012180904522613066,
"loss": 2.4153,
"step": 395
},
{
"epoch": 1.1147079521463759,
"grad_norm": 0.8225473165512085,
"learning_rate": 0.00012160804020100502,
"loss": 2.3042,
"step": 396
},
{
"epoch": 1.1175228712174525,
"grad_norm": 0.8238793015480042,
"learning_rate": 0.00012140703517587942,
"loss": 2.3454,
"step": 397
},
{
"epoch": 1.1203377902885292,
"grad_norm": 0.9430282711982727,
"learning_rate": 0.00012120603015075378,
"loss": 2.4815,
"step": 398
},
{
"epoch": 1.1231527093596059,
"grad_norm": 0.8851016759872437,
"learning_rate": 0.00012100502512562815,
"loss": 2.2304,
"step": 399
},
{
"epoch": 1.1259676284306825,
"grad_norm": 1.1577056646347046,
"learning_rate": 0.00012080402010050251,
"loss": 2.3881,
"step": 400
},
{
"epoch": 1.1287825475017592,
"grad_norm": 0.8923066258430481,
"learning_rate": 0.00012060301507537688,
"loss": 2.6341,
"step": 401
},
{
"epoch": 1.131597466572836,
"grad_norm": 0.8544619679450989,
"learning_rate": 0.00012040201005025127,
"loss": 2.5802,
"step": 402
},
{
"epoch": 1.1344123856439128,
"grad_norm": 0.9601594805717468,
"learning_rate": 0.00012020100502512563,
"loss": 2.2292,
"step": 403
},
{
"epoch": 1.1372273047149895,
"grad_norm": 0.9403390884399414,
"learning_rate": 0.00012,
"loss": 2.5492,
"step": 404
},
{
"epoch": 1.1400422237860661,
"grad_norm": 0.7530049681663513,
"learning_rate": 0.00011979899497487436,
"loss": 2.3684,
"step": 405
},
{
"epoch": 1.1428571428571428,
"grad_norm": 1.0320841073989868,
"learning_rate": 0.00011959798994974876,
"loss": 2.4157,
"step": 406
},
{
"epoch": 1.1456720619282197,
"grad_norm": 1.1246405839920044,
"learning_rate": 0.00011939698492462312,
"loss": 2.6815,
"step": 407
},
{
"epoch": 1.1484869809992964,
"grad_norm": 0.8500766158103943,
"learning_rate": 0.0001191959798994975,
"loss": 2.4088,
"step": 408
},
{
"epoch": 1.151301900070373,
"grad_norm": 0.7606078386306763,
"learning_rate": 0.00011899497487437185,
"loss": 2.2564,
"step": 409
},
{
"epoch": 1.1541168191414497,
"grad_norm": 0.8513486385345459,
"learning_rate": 0.00011879396984924624,
"loss": 2.5881,
"step": 410
},
{
"epoch": 1.1569317382125264,
"grad_norm": 0.7827906012535095,
"learning_rate": 0.00011859296482412061,
"loss": 2.4386,
"step": 411
},
{
"epoch": 1.159746657283603,
"grad_norm": 0.9784967303276062,
"learning_rate": 0.00011839195979899497,
"loss": 2.5552,
"step": 412
},
{
"epoch": 1.1625615763546797,
"grad_norm": 0.8472895622253418,
"learning_rate": 0.00011819095477386935,
"loss": 2.6518,
"step": 413
},
{
"epoch": 1.1653764954257566,
"grad_norm": 0.7687847018241882,
"learning_rate": 0.00011798994974874373,
"loss": 2.39,
"step": 414
},
{
"epoch": 1.1681914144968333,
"grad_norm": 0.7497126460075378,
"learning_rate": 0.0001177889447236181,
"loss": 2.2635,
"step": 415
},
{
"epoch": 1.17100633356791,
"grad_norm": 0.7597271203994751,
"learning_rate": 0.00011758793969849247,
"loss": 2.6295,
"step": 416
},
{
"epoch": 1.1738212526389866,
"grad_norm": 0.770999550819397,
"learning_rate": 0.00011738693467336684,
"loss": 2.3357,
"step": 417
},
{
"epoch": 1.1766361717100633,
"grad_norm": 0.819741427898407,
"learning_rate": 0.00011718592964824122,
"loss": 2.4455,
"step": 418
},
{
"epoch": 1.17945109078114,
"grad_norm": 0.8049472570419312,
"learning_rate": 0.00011698492462311558,
"loss": 2.184,
"step": 419
},
{
"epoch": 1.1822660098522166,
"grad_norm": 0.8897677659988403,
"learning_rate": 0.00011678391959798996,
"loss": 2.3405,
"step": 420
},
{
"epoch": 1.1850809289232935,
"grad_norm": 0.9535378217697144,
"learning_rate": 0.00011658291457286432,
"loss": 2.3352,
"step": 421
},
{
"epoch": 1.1878958479943702,
"grad_norm": 0.7934727072715759,
"learning_rate": 0.00011638190954773872,
"loss": 2.2503,
"step": 422
},
{
"epoch": 1.1907107670654469,
"grad_norm": 1.1643705368041992,
"learning_rate": 0.00011618090452261308,
"loss": 2.4985,
"step": 423
},
{
"epoch": 1.1935256861365235,
"grad_norm": 1.05571448802948,
"learning_rate": 0.00011597989949748745,
"loss": 2.5174,
"step": 424
},
{
"epoch": 1.1963406052076002,
"grad_norm": 0.8346055150032043,
"learning_rate": 0.00011577889447236181,
"loss": 2.331,
"step": 425
},
{
"epoch": 1.199155524278677,
"grad_norm": 1.067415475845337,
"learning_rate": 0.00011557788944723618,
"loss": 2.4303,
"step": 426
},
{
"epoch": 1.2019704433497538,
"grad_norm": 1.0706610679626465,
"learning_rate": 0.00011537688442211057,
"loss": 2.2276,
"step": 427
},
{
"epoch": 1.2047853624208305,
"grad_norm": 0.877740204334259,
"learning_rate": 0.00011517587939698493,
"loss": 2.3532,
"step": 428
},
{
"epoch": 1.2076002814919071,
"grad_norm": 0.9245136380195618,
"learning_rate": 0.0001149748743718593,
"loss": 2.1982,
"step": 429
},
{
"epoch": 1.2104152005629838,
"grad_norm": 0.8375447392463684,
"learning_rate": 0.00011477386934673366,
"loss": 2.2702,
"step": 430
},
{
"epoch": 1.2132301196340605,
"grad_norm": 1.0361285209655762,
"learning_rate": 0.00011457286432160806,
"loss": 2.5355,
"step": 431
},
{
"epoch": 1.2160450387051371,
"grad_norm": 0.9980331063270569,
"learning_rate": 0.00011437185929648242,
"loss": 2.0764,
"step": 432
},
{
"epoch": 1.218859957776214,
"grad_norm": 0.8354774117469788,
"learning_rate": 0.00011417085427135679,
"loss": 2.5983,
"step": 433
},
{
"epoch": 1.2216748768472907,
"grad_norm": 0.8765326738357544,
"learning_rate": 0.00011396984924623115,
"loss": 2.0808,
"step": 434
},
{
"epoch": 1.2244897959183674,
"grad_norm": 1.077864408493042,
"learning_rate": 0.00011376884422110554,
"loss": 2.2868,
"step": 435
},
{
"epoch": 1.227304714989444,
"grad_norm": 0.8155612945556641,
"learning_rate": 0.00011356783919597991,
"loss": 2.2297,
"step": 436
},
{
"epoch": 1.2301196340605207,
"grad_norm": 0.9063975811004639,
"learning_rate": 0.00011336683417085427,
"loss": 2.522,
"step": 437
},
{
"epoch": 1.2329345531315974,
"grad_norm": 0.9975262880325317,
"learning_rate": 0.00011316582914572864,
"loss": 2.4027,
"step": 438
},
{
"epoch": 1.235749472202674,
"grad_norm": 0.860905110836029,
"learning_rate": 0.00011296482412060303,
"loss": 2.4354,
"step": 439
},
{
"epoch": 1.238564391273751,
"grad_norm": 0.9583187103271484,
"learning_rate": 0.0001127638190954774,
"loss": 2.3433,
"step": 440
},
{
"epoch": 1.2413793103448276,
"grad_norm": 0.8732121586799622,
"learning_rate": 0.00011256281407035176,
"loss": 2.3921,
"step": 441
},
{
"epoch": 1.2441942294159043,
"grad_norm": 0.9089124798774719,
"learning_rate": 0.00011236180904522614,
"loss": 2.2424,
"step": 442
},
{
"epoch": 1.247009148486981,
"grad_norm": 0.8566604852676392,
"learning_rate": 0.00011216080402010052,
"loss": 2.4264,
"step": 443
},
{
"epoch": 1.2498240675580576,
"grad_norm": 0.9148624539375305,
"learning_rate": 0.00011195979899497488,
"loss": 2.3394,
"step": 444
},
{
"epoch": 1.2526389866291345,
"grad_norm": 0.8913928866386414,
"learning_rate": 0.00011175879396984925,
"loss": 2.2236,
"step": 445
},
{
"epoch": 1.255453905700211,
"grad_norm": 1.1119465827941895,
"learning_rate": 0.00011155778894472361,
"loss": 2.391,
"step": 446
},
{
"epoch": 1.2582688247712879,
"grad_norm": 1.1434952020645142,
"learning_rate": 0.00011135678391959799,
"loss": 2.4987,
"step": 447
},
{
"epoch": 1.2610837438423645,
"grad_norm": 0.9885523319244385,
"learning_rate": 0.00011115577889447237,
"loss": 2.5184,
"step": 448
},
{
"epoch": 1.2638986629134412,
"grad_norm": 0.9945192337036133,
"learning_rate": 0.00011095477386934675,
"loss": 2.4046,
"step": 449
},
{
"epoch": 1.266713581984518,
"grad_norm": 0.9107452034950256,
"learning_rate": 0.0001107537688442211,
"loss": 2.4296,
"step": 450
},
{
"epoch": 1.2695285010555946,
"grad_norm": 1.2265137434005737,
"learning_rate": 0.00011055276381909548,
"loss": 2.4336,
"step": 451
},
{
"epoch": 1.2723434201266715,
"grad_norm": 0.907394289970398,
"learning_rate": 0.00011035175879396986,
"loss": 2.4008,
"step": 452
},
{
"epoch": 1.2751583391977481,
"grad_norm": 0.884708821773529,
"learning_rate": 0.00011015075376884422,
"loss": 2.5134,
"step": 453
},
{
"epoch": 1.2779732582688248,
"grad_norm": 0.8295673727989197,
"learning_rate": 0.0001099497487437186,
"loss": 2.5117,
"step": 454
},
{
"epoch": 1.2807881773399015,
"grad_norm": 1.0812764167785645,
"learning_rate": 0.00010974874371859296,
"loss": 2.372,
"step": 455
},
{
"epoch": 1.2836030964109781,
"grad_norm": 1.0535778999328613,
"learning_rate": 0.00010954773869346736,
"loss": 2.5114,
"step": 456
},
{
"epoch": 1.2864180154820548,
"grad_norm": 1.1005867719650269,
"learning_rate": 0.00010934673366834172,
"loss": 2.4269,
"step": 457
},
{
"epoch": 1.2892329345531315,
"grad_norm": 0.813443124294281,
"learning_rate": 0.00010914572864321609,
"loss": 2.3226,
"step": 458
},
{
"epoch": 1.2920478536242084,
"grad_norm": 0.8614223599433899,
"learning_rate": 0.00010894472361809045,
"loss": 2.3945,
"step": 459
},
{
"epoch": 1.294862772695285,
"grad_norm": 0.9305881857872009,
"learning_rate": 0.00010874371859296483,
"loss": 2.4106,
"step": 460
},
{
"epoch": 1.2976776917663617,
"grad_norm": 0.7936707139015198,
"learning_rate": 0.00010854271356783921,
"loss": 2.2523,
"step": 461
},
{
"epoch": 1.3004926108374384,
"grad_norm": 0.9864185452461243,
"learning_rate": 0.00010834170854271357,
"loss": 2.4317,
"step": 462
},
{
"epoch": 1.303307529908515,
"grad_norm": 0.8099750876426697,
"learning_rate": 0.00010814070351758794,
"loss": 2.5428,
"step": 463
},
{
"epoch": 1.306122448979592,
"grad_norm": 0.8694155216217041,
"learning_rate": 0.00010793969849246233,
"loss": 2.4084,
"step": 464
},
{
"epoch": 1.3089373680506686,
"grad_norm": 0.963947057723999,
"learning_rate": 0.0001077386934673367,
"loss": 2.3802,
"step": 465
},
{
"epoch": 1.3117522871217453,
"grad_norm": 0.9907119274139404,
"learning_rate": 0.00010753768844221106,
"loss": 2.3028,
"step": 466
},
{
"epoch": 1.314567206192822,
"grad_norm": 0.8978596329689026,
"learning_rate": 0.00010733668341708543,
"loss": 2.129,
"step": 467
},
{
"epoch": 1.3173821252638986,
"grad_norm": 1.0621075630187988,
"learning_rate": 0.00010713567839195982,
"loss": 2.4765,
"step": 468
},
{
"epoch": 1.3201970443349753,
"grad_norm": 1.0847358703613281,
"learning_rate": 0.00010693467336683418,
"loss": 2.4611,
"step": 469
},
{
"epoch": 1.323011963406052,
"grad_norm": 0.8706623315811157,
"learning_rate": 0.00010673366834170855,
"loss": 2.5458,
"step": 470
},
{
"epoch": 1.3258268824771289,
"grad_norm": 1.0084209442138672,
"learning_rate": 0.00010653266331658291,
"loss": 2.4968,
"step": 471
},
{
"epoch": 1.3286418015482055,
"grad_norm": 0.8770229816436768,
"learning_rate": 0.00010633165829145728,
"loss": 2.3268,
"step": 472
},
{
"epoch": 1.3314567206192822,
"grad_norm": 0.9652953743934631,
"learning_rate": 0.00010613065326633167,
"loss": 2.3758,
"step": 473
},
{
"epoch": 1.334271639690359,
"grad_norm": 0.8194919228553772,
"learning_rate": 0.00010592964824120604,
"loss": 2.4732,
"step": 474
},
{
"epoch": 1.3370865587614356,
"grad_norm": 3.534748077392578,
"learning_rate": 0.0001057286432160804,
"loss": 2.3725,
"step": 475
},
{
"epoch": 1.3399014778325122,
"grad_norm": 0.9962548017501831,
"learning_rate": 0.00010552763819095478,
"loss": 2.3963,
"step": 476
},
{
"epoch": 1.342716396903589,
"grad_norm": 0.794152021408081,
"learning_rate": 0.00010532663316582916,
"loss": 2.2899,
"step": 477
},
{
"epoch": 1.3455313159746658,
"grad_norm": 1.100648283958435,
"learning_rate": 0.00010512562814070352,
"loss": 2.5847,
"step": 478
},
{
"epoch": 1.3483462350457425,
"grad_norm": 0.8269829154014587,
"learning_rate": 0.0001049246231155779,
"loss": 2.4694,
"step": 479
},
{
"epoch": 1.3511611541168191,
"grad_norm": 0.915529727935791,
"learning_rate": 0.00010472361809045225,
"loss": 2.2721,
"step": 480
},
{
"epoch": 1.3539760731878958,
"grad_norm": 0.8491760492324829,
"learning_rate": 0.00010452261306532664,
"loss": 2.3008,
"step": 481
},
{
"epoch": 1.3567909922589725,
"grad_norm": 0.8877702355384827,
"learning_rate": 0.00010432160804020101,
"loss": 2.33,
"step": 482
},
{
"epoch": 1.3596059113300494,
"grad_norm": 0.86586993932724,
"learning_rate": 0.00010412060301507539,
"loss": 2.0962,
"step": 483
},
{
"epoch": 1.362420830401126,
"grad_norm": 0.8984941244125366,
"learning_rate": 0.00010391959798994975,
"loss": 2.1901,
"step": 484
},
{
"epoch": 1.3652357494722027,
"grad_norm": 0.8369758129119873,
"learning_rate": 0.00010371859296482413,
"loss": 2.4045,
"step": 485
},
{
"epoch": 1.3680506685432794,
"grad_norm": 0.7900081276893616,
"learning_rate": 0.0001035175879396985,
"loss": 2.1448,
"step": 486
},
{
"epoch": 1.370865587614356,
"grad_norm": 0.9296205043792725,
"learning_rate": 0.00010331658291457286,
"loss": 2.2958,
"step": 487
},
{
"epoch": 1.3736805066854327,
"grad_norm": 1.0592749118804932,
"learning_rate": 0.00010311557788944724,
"loss": 2.524,
"step": 488
},
{
"epoch": 1.3764954257565094,
"grad_norm": 0.7983985543251038,
"learning_rate": 0.00010291457286432162,
"loss": 2.4329,
"step": 489
},
{
"epoch": 1.3793103448275863,
"grad_norm": 1.1377589702606201,
"learning_rate": 0.00010271356783919598,
"loss": 2.2845,
"step": 490
},
{
"epoch": 1.382125263898663,
"grad_norm": 1.1031099557876587,
"learning_rate": 0.00010251256281407036,
"loss": 2.3531,
"step": 491
},
{
"epoch": 1.3849401829697396,
"grad_norm": 0.9376154541969299,
"learning_rate": 0.00010231155778894473,
"loss": 2.1406,
"step": 492
},
{
"epoch": 1.3877551020408163,
"grad_norm": 1.0728362798690796,
"learning_rate": 0.00010211055276381909,
"loss": 2.4291,
"step": 493
},
{
"epoch": 1.390570021111893,
"grad_norm": 1.021877408027649,
"learning_rate": 0.00010190954773869348,
"loss": 2.4566,
"step": 494
},
{
"epoch": 1.3933849401829699,
"grad_norm": 1.1455014944076538,
"learning_rate": 0.00010170854271356785,
"loss": 2.3074,
"step": 495
},
{
"epoch": 1.3961998592540463,
"grad_norm": 0.8512632846832275,
"learning_rate": 0.00010150753768844221,
"loss": 2.4004,
"step": 496
},
{
"epoch": 1.3990147783251232,
"grad_norm": 0.9220101237297058,
"learning_rate": 0.00010130653266331658,
"loss": 2.4456,
"step": 497
},
{
"epoch": 1.4018296973962,
"grad_norm": 1.0779199600219727,
"learning_rate": 0.00010110552763819097,
"loss": 2.4918,
"step": 498
},
{
"epoch": 1.4046446164672766,
"grad_norm": 1.0237290859222412,
"learning_rate": 0.00010090452261306533,
"loss": 2.3606,
"step": 499
},
{
"epoch": 1.4074595355383532,
"grad_norm": 1.2366681098937988,
"learning_rate": 0.0001007035175879397,
"loss": 2.468,
"step": 500
},
{
"epoch": 1.41027445460943,
"grad_norm": 0.9772239923477173,
"learning_rate": 0.00010050251256281407,
"loss": 2.2868,
"step": 501
},
{
"epoch": 1.4130893736805068,
"grad_norm": 0.9963237643241882,
"learning_rate": 0.00010030150753768846,
"loss": 2.3794,
"step": 502
},
{
"epoch": 1.4159042927515835,
"grad_norm": 0.8932761549949646,
"learning_rate": 0.00010010050251256282,
"loss": 2.4493,
"step": 503
},
{
"epoch": 1.4187192118226601,
"grad_norm": 0.8121969103813171,
"learning_rate": 9.989949748743719e-05,
"loss": 2.4045,
"step": 504
},
{
"epoch": 1.4215341308937368,
"grad_norm": 0.9738163948059082,
"learning_rate": 9.969849246231156e-05,
"loss": 2.3725,
"step": 505
},
{
"epoch": 1.4243490499648135,
"grad_norm": 0.8812170028686523,
"learning_rate": 9.949748743718594e-05,
"loss": 2.2143,
"step": 506
},
{
"epoch": 1.4271639690358902,
"grad_norm": 0.9500517249107361,
"learning_rate": 9.929648241206031e-05,
"loss": 2.4194,
"step": 507
},
{
"epoch": 1.4299788881069668,
"grad_norm": 0.8924652338027954,
"learning_rate": 9.909547738693468e-05,
"loss": 2.4061,
"step": 508
},
{
"epoch": 1.4327938071780437,
"grad_norm": 0.9255656003952026,
"learning_rate": 9.889447236180906e-05,
"loss": 2.4458,
"step": 509
},
{
"epoch": 1.4356087262491204,
"grad_norm": 0.9696526527404785,
"learning_rate": 9.869346733668342e-05,
"loss": 2.3285,
"step": 510
},
{
"epoch": 1.438423645320197,
"grad_norm": 0.9249640107154846,
"learning_rate": 9.84924623115578e-05,
"loss": 2.3348,
"step": 511
},
{
"epoch": 1.4412385643912737,
"grad_norm": 0.8589572906494141,
"learning_rate": 9.829145728643216e-05,
"loss": 2.3224,
"step": 512
},
{
"epoch": 1.4440534834623504,
"grad_norm": 0.9638547301292419,
"learning_rate": 9.809045226130655e-05,
"loss": 2.3172,
"step": 513
},
{
"epoch": 1.4468684025334273,
"grad_norm": 0.9466349482536316,
"learning_rate": 9.788944723618091e-05,
"loss": 2.4924,
"step": 514
},
{
"epoch": 1.4496833216045037,
"grad_norm": 0.891727089881897,
"learning_rate": 9.768844221105528e-05,
"loss": 2.446,
"step": 515
},
{
"epoch": 1.4524982406755806,
"grad_norm": 0.9032166600227356,
"learning_rate": 9.748743718592965e-05,
"loss": 2.3733,
"step": 516
},
{
"epoch": 1.4553131597466573,
"grad_norm": 0.8339729905128479,
"learning_rate": 9.728643216080403e-05,
"loss": 2.0429,
"step": 517
},
{
"epoch": 1.458128078817734,
"grad_norm": 0.8902753591537476,
"learning_rate": 9.70854271356784e-05,
"loss": 2.4414,
"step": 518
},
{
"epoch": 1.4609429978888107,
"grad_norm": 3.1321892738342285,
"learning_rate": 9.688442211055276e-05,
"loss": 2.1431,
"step": 519
},
{
"epoch": 1.4637579169598873,
"grad_norm": 0.8980495929718018,
"learning_rate": 9.668341708542715e-05,
"loss": 2.646,
"step": 520
},
{
"epoch": 1.4665728360309642,
"grad_norm": 0.9685273170471191,
"learning_rate": 9.64824120603015e-05,
"loss": 2.3513,
"step": 521
},
{
"epoch": 1.469387755102041,
"grad_norm": 0.9794145822525024,
"learning_rate": 9.628140703517589e-05,
"loss": 2.477,
"step": 522
},
{
"epoch": 1.4722026741731176,
"grad_norm": 1.0298691987991333,
"learning_rate": 9.608040201005025e-05,
"loss": 2.3071,
"step": 523
},
{
"epoch": 1.4750175932441942,
"grad_norm": 1.007864236831665,
"learning_rate": 9.587939698492462e-05,
"loss": 2.443,
"step": 524
},
{
"epoch": 1.477832512315271,
"grad_norm": 0.8953837752342224,
"learning_rate": 9.5678391959799e-05,
"loss": 2.645,
"step": 525
},
{
"epoch": 1.4806474313863476,
"grad_norm": 0.9239638447761536,
"learning_rate": 9.547738693467337e-05,
"loss": 2.4693,
"step": 526
},
{
"epoch": 1.4834623504574243,
"grad_norm": 0.9644606709480286,
"learning_rate": 9.527638190954774e-05,
"loss": 2.427,
"step": 527
},
{
"epoch": 1.4862772695285011,
"grad_norm": 0.9406245946884155,
"learning_rate": 9.507537688442212e-05,
"loss": 2.4382,
"step": 528
},
{
"epoch": 1.4890921885995778,
"grad_norm": 1.0155800580978394,
"learning_rate": 9.487437185929649e-05,
"loss": 2.4304,
"step": 529
},
{
"epoch": 1.4919071076706545,
"grad_norm": 0.8856943845748901,
"learning_rate": 9.467336683417086e-05,
"loss": 2.4116,
"step": 530
},
{
"epoch": 1.4947220267417312,
"grad_norm": 0.8708421587944031,
"learning_rate": 9.447236180904523e-05,
"loss": 2.4999,
"step": 531
},
{
"epoch": 1.4975369458128078,
"grad_norm": 0.9287678599357605,
"learning_rate": 9.427135678391961e-05,
"loss": 2.3666,
"step": 532
},
{
"epoch": 1.5003518648838847,
"grad_norm": 0.9635983109474182,
"learning_rate": 9.407035175879397e-05,
"loss": 2.545,
"step": 533
},
{
"epoch": 1.5031667839549612,
"grad_norm": 0.8631216287612915,
"learning_rate": 9.386934673366835e-05,
"loss": 2.3323,
"step": 534
},
{
"epoch": 1.505981703026038,
"grad_norm": 0.9372367858886719,
"learning_rate": 9.366834170854271e-05,
"loss": 2.5233,
"step": 535
},
{
"epoch": 1.5087966220971147,
"grad_norm": 0.8693691492080688,
"learning_rate": 9.34673366834171e-05,
"loss": 2.3968,
"step": 536
},
{
"epoch": 1.5116115411681914,
"grad_norm": 0.911888062953949,
"learning_rate": 9.326633165829146e-05,
"loss": 2.5019,
"step": 537
},
{
"epoch": 1.514426460239268,
"grad_norm": 0.9012334942817688,
"learning_rate": 9.306532663316585e-05,
"loss": 2.4697,
"step": 538
},
{
"epoch": 1.5172413793103448,
"grad_norm": 0.9713804721832275,
"learning_rate": 9.28643216080402e-05,
"loss": 2.4162,
"step": 539
},
{
"epoch": 1.5200562983814216,
"grad_norm": 0.9272815585136414,
"learning_rate": 9.266331658291458e-05,
"loss": 2.4306,
"step": 540
},
{
"epoch": 1.522871217452498,
"grad_norm": 0.9575127959251404,
"learning_rate": 9.246231155778895e-05,
"loss": 2.2645,
"step": 541
},
{
"epoch": 1.525686136523575,
"grad_norm": 1.0728868246078491,
"learning_rate": 9.226130653266331e-05,
"loss": 2.4701,
"step": 542
},
{
"epoch": 1.5285010555946517,
"grad_norm": 0.8400806784629822,
"learning_rate": 9.20603015075377e-05,
"loss": 2.1602,
"step": 543
},
{
"epoch": 1.5313159746657283,
"grad_norm": 0.9699164032936096,
"learning_rate": 9.185929648241206e-05,
"loss": 2.3135,
"step": 544
},
{
"epoch": 1.5341308937368052,
"grad_norm": 1.0054633617401123,
"learning_rate": 9.165829145728644e-05,
"loss": 2.2226,
"step": 545
},
{
"epoch": 1.5369458128078817,
"grad_norm": 0.9745274782180786,
"learning_rate": 9.14572864321608e-05,
"loss": 2.3635,
"step": 546
},
{
"epoch": 1.5397607318789586,
"grad_norm": 0.8937272429466248,
"learning_rate": 9.125628140703519e-05,
"loss": 2.1517,
"step": 547
},
{
"epoch": 1.5425756509500352,
"grad_norm": 1.0121883153915405,
"learning_rate": 9.105527638190955e-05,
"loss": 2.3947,
"step": 548
},
{
"epoch": 1.545390570021112,
"grad_norm": 0.9827173352241516,
"learning_rate": 9.085427135678392e-05,
"loss": 2.3363,
"step": 549
},
{
"epoch": 1.5482054890921886,
"grad_norm": 0.9372025728225708,
"learning_rate": 9.06532663316583e-05,
"loss": 2.4856,
"step": 550
},
{
"epoch": 1.5510204081632653,
"grad_norm": 1.1509451866149902,
"learning_rate": 9.045226130653267e-05,
"loss": 2.444,
"step": 551
},
{
"epoch": 1.5538353272343421,
"grad_norm": 1.0390241146087646,
"learning_rate": 9.025125628140704e-05,
"loss": 2.5131,
"step": 552
},
{
"epoch": 1.5566502463054186,
"grad_norm": 0.9539273381233215,
"learning_rate": 9.005025125628141e-05,
"loss": 2.3651,
"step": 553
},
{
"epoch": 1.5594651653764955,
"grad_norm": 1.0755736827850342,
"learning_rate": 8.984924623115579e-05,
"loss": 2.5255,
"step": 554
},
{
"epoch": 1.5622800844475722,
"grad_norm": 0.850516140460968,
"learning_rate": 8.964824120603016e-05,
"loss": 2.3374,
"step": 555
},
{
"epoch": 1.5650950035186488,
"grad_norm": 0.8225517868995667,
"learning_rate": 8.944723618090453e-05,
"loss": 2.2753,
"step": 556
},
{
"epoch": 1.5679099225897255,
"grad_norm": 0.9345348477363586,
"learning_rate": 8.92462311557789e-05,
"loss": 2.3177,
"step": 557
},
{
"epoch": 1.5707248416608022,
"grad_norm": 1.1513749361038208,
"learning_rate": 8.904522613065326e-05,
"loss": 2.5246,
"step": 558
},
{
"epoch": 1.573539760731879,
"grad_norm": 0.8601298332214355,
"learning_rate": 8.884422110552765e-05,
"loss": 2.2688,
"step": 559
},
{
"epoch": 1.5763546798029555,
"grad_norm": 0.9165076613426208,
"learning_rate": 8.864321608040201e-05,
"loss": 2.3773,
"step": 560
},
{
"epoch": 1.5791695988740324,
"grad_norm": 1.0467596054077148,
"learning_rate": 8.84422110552764e-05,
"loss": 2.3887,
"step": 561
},
{
"epoch": 1.581984517945109,
"grad_norm": 0.994055449962616,
"learning_rate": 8.824120603015076e-05,
"loss": 2.5547,
"step": 562
},
{
"epoch": 1.5847994370161858,
"grad_norm": 1.100698709487915,
"learning_rate": 8.804020100502513e-05,
"loss": 2.3517,
"step": 563
},
{
"epoch": 1.5876143560872626,
"grad_norm": 0.9837064743041992,
"learning_rate": 8.78391959798995e-05,
"loss": 2.3602,
"step": 564
},
{
"epoch": 1.590429275158339,
"grad_norm": 0.9642098546028137,
"learning_rate": 8.763819095477387e-05,
"loss": 2.3596,
"step": 565
},
{
"epoch": 1.593244194229416,
"grad_norm": 0.930444061756134,
"learning_rate": 8.743718592964825e-05,
"loss": 2.4475,
"step": 566
},
{
"epoch": 1.5960591133004927,
"grad_norm": 3.837113618850708,
"learning_rate": 8.723618090452261e-05,
"loss": 2.3049,
"step": 567
},
{
"epoch": 1.5988740323715693,
"grad_norm": 0.9022939205169678,
"learning_rate": 8.7035175879397e-05,
"loss": 2.5377,
"step": 568
},
{
"epoch": 1.601688951442646,
"grad_norm": 0.9081845879554749,
"learning_rate": 8.683417085427135e-05,
"loss": 2.2669,
"step": 569
},
{
"epoch": 1.6045038705137227,
"grad_norm": 1.0544410943984985,
"learning_rate": 8.663316582914574e-05,
"loss": 2.5433,
"step": 570
},
{
"epoch": 1.6073187895847996,
"grad_norm": 1.0048531293869019,
"learning_rate": 8.64321608040201e-05,
"loss": 2.3306,
"step": 571
},
{
"epoch": 1.610133708655876,
"grad_norm": 0.9438222646713257,
"learning_rate": 8.623115577889449e-05,
"loss": 2.3569,
"step": 572
},
{
"epoch": 1.612948627726953,
"grad_norm": 0.8773800134658813,
"learning_rate": 8.603015075376884e-05,
"loss": 2.2443,
"step": 573
},
{
"epoch": 1.6157635467980296,
"grad_norm": 0.8797910809516907,
"learning_rate": 8.582914572864322e-05,
"loss": 2.3595,
"step": 574
},
{
"epoch": 1.6185784658691063,
"grad_norm": 1.003212809562683,
"learning_rate": 8.562814070351759e-05,
"loss": 2.3623,
"step": 575
},
{
"epoch": 1.621393384940183,
"grad_norm": 0.97022944688797,
"learning_rate": 8.542713567839196e-05,
"loss": 2.3163,
"step": 576
},
{
"epoch": 1.6242083040112596,
"grad_norm": 1.0100817680358887,
"learning_rate": 8.522613065326634e-05,
"loss": 2.4039,
"step": 577
},
{
"epoch": 1.6270232230823365,
"grad_norm": 0.9104019403457642,
"learning_rate": 8.502512562814071e-05,
"loss": 2.2857,
"step": 578
},
{
"epoch": 1.629838142153413,
"grad_norm": 0.8757676482200623,
"learning_rate": 8.482412060301508e-05,
"loss": 1.8537,
"step": 579
},
{
"epoch": 1.6326530612244898,
"grad_norm": 1.2689822912216187,
"learning_rate": 8.462311557788946e-05,
"loss": 2.375,
"step": 580
},
{
"epoch": 1.6354679802955665,
"grad_norm": 0.9648078680038452,
"learning_rate": 8.442211055276383e-05,
"loss": 2.4079,
"step": 581
},
{
"epoch": 1.6382828993666432,
"grad_norm": 0.8941888213157654,
"learning_rate": 8.42211055276382e-05,
"loss": 2.3487,
"step": 582
},
{
"epoch": 1.64109781843772,
"grad_norm": 1.1510968208312988,
"learning_rate": 8.402010050251256e-05,
"loss": 2.2717,
"step": 583
},
{
"epoch": 1.6439127375087965,
"grad_norm": 0.8997256755828857,
"learning_rate": 8.381909547738695e-05,
"loss": 2.2837,
"step": 584
},
{
"epoch": 1.6467276565798734,
"grad_norm": 0.9518352746963501,
"learning_rate": 8.36180904522613e-05,
"loss": 2.4866,
"step": 585
},
{
"epoch": 1.64954257565095,
"grad_norm": 0.9324397444725037,
"learning_rate": 8.341708542713568e-05,
"loss": 2.3405,
"step": 586
},
{
"epoch": 1.6523574947220268,
"grad_norm": 0.9339852929115295,
"learning_rate": 8.321608040201005e-05,
"loss": 2.2778,
"step": 587
},
{
"epoch": 1.6551724137931034,
"grad_norm": 1.0167529582977295,
"learning_rate": 8.301507537688443e-05,
"loss": 2.3714,
"step": 588
},
{
"epoch": 1.65798733286418,
"grad_norm": 1.1903690099716187,
"learning_rate": 8.28140703517588e-05,
"loss": 2.6944,
"step": 589
},
{
"epoch": 1.660802251935257,
"grad_norm": 0.9818819165229797,
"learning_rate": 8.261306532663317e-05,
"loss": 2.5714,
"step": 590
},
{
"epoch": 1.6636171710063334,
"grad_norm": 0.9528331160545349,
"learning_rate": 8.241206030150754e-05,
"loss": 2.4357,
"step": 591
},
{
"epoch": 1.6664320900774103,
"grad_norm": 1.125532865524292,
"learning_rate": 8.22110552763819e-05,
"loss": 2.7857,
"step": 592
},
{
"epoch": 1.669247009148487,
"grad_norm": 1.0312976837158203,
"learning_rate": 8.201005025125629e-05,
"loss": 2.5711,
"step": 593
},
{
"epoch": 1.6720619282195637,
"grad_norm": 1.0230183601379395,
"learning_rate": 8.180904522613065e-05,
"loss": 2.449,
"step": 594
},
{
"epoch": 1.6748768472906403,
"grad_norm": 1.0071231126785278,
"learning_rate": 8.160804020100504e-05,
"loss": 2.4473,
"step": 595
},
{
"epoch": 1.677691766361717,
"grad_norm": 0.8198001980781555,
"learning_rate": 8.14070351758794e-05,
"loss": 2.2743,
"step": 596
},
{
"epoch": 1.680506685432794,
"grad_norm": 0.9489749670028687,
"learning_rate": 8.120603015075378e-05,
"loss": 2.3807,
"step": 597
},
{
"epoch": 1.6833216045038704,
"grad_norm": 0.9406233429908752,
"learning_rate": 8.100502512562814e-05,
"loss": 2.3478,
"step": 598
},
{
"epoch": 1.6861365235749473,
"grad_norm": 1.1207877397537231,
"learning_rate": 8.080402010050251e-05,
"loss": 2.4251,
"step": 599
},
{
"epoch": 1.688951442646024,
"grad_norm": 0.8841767311096191,
"learning_rate": 8.060301507537689e-05,
"loss": 2.4555,
"step": 600
},
{
"epoch": 1.6917663617171006,
"grad_norm": 1.0105196237564087,
"learning_rate": 8.040201005025126e-05,
"loss": 2.4965,
"step": 601
},
{
"epoch": 1.6945812807881775,
"grad_norm": 1.4616045951843262,
"learning_rate": 8.020100502512563e-05,
"loss": 2.1856,
"step": 602
},
{
"epoch": 1.697396199859254,
"grad_norm": 0.8796388506889343,
"learning_rate": 8e-05,
"loss": 2.267,
"step": 603
},
{
"epoch": 1.7002111189303308,
"grad_norm": 0.9459576606750488,
"learning_rate": 7.979899497487438e-05,
"loss": 2.3772,
"step": 604
},
{
"epoch": 1.7030260380014075,
"grad_norm": 0.9645008444786072,
"learning_rate": 7.959798994974875e-05,
"loss": 2.4959,
"step": 605
},
{
"epoch": 1.7058409570724842,
"grad_norm": 1.002943515777588,
"learning_rate": 7.939698492462313e-05,
"loss": 2.2909,
"step": 606
},
{
"epoch": 1.7086558761435608,
"grad_norm": 0.9787498116493225,
"learning_rate": 7.91959798994975e-05,
"loss": 2.4482,
"step": 607
},
{
"epoch": 1.7114707952146375,
"grad_norm": 0.9761974811553955,
"learning_rate": 7.899497487437186e-05,
"loss": 2.358,
"step": 608
},
{
"epoch": 1.7142857142857144,
"grad_norm": 1.0572192668914795,
"learning_rate": 7.879396984924623e-05,
"loss": 2.3769,
"step": 609
},
{
"epoch": 1.7171006333567909,
"grad_norm": 1.1789201498031616,
"learning_rate": 7.85929648241206e-05,
"loss": 2.4841,
"step": 610
},
{
"epoch": 1.7199155524278678,
"grad_norm": 1.2059478759765625,
"learning_rate": 7.839195979899498e-05,
"loss": 2.3738,
"step": 611
},
{
"epoch": 1.7227304714989444,
"grad_norm": 0.884793758392334,
"learning_rate": 7.819095477386935e-05,
"loss": 2.2696,
"step": 612
},
{
"epoch": 1.725545390570021,
"grad_norm": 1.1065027713775635,
"learning_rate": 7.798994974874372e-05,
"loss": 2.2802,
"step": 613
},
{
"epoch": 1.7283603096410978,
"grad_norm": 0.9025463461875916,
"learning_rate": 7.77889447236181e-05,
"loss": 2.2835,
"step": 614
},
{
"epoch": 1.7311752287121744,
"grad_norm": 0.939608633518219,
"learning_rate": 7.758793969849247e-05,
"loss": 2.2813,
"step": 615
},
{
"epoch": 1.7339901477832513,
"grad_norm": 1.2778400182724,
"learning_rate": 7.738693467336684e-05,
"loss": 2.5467,
"step": 616
},
{
"epoch": 1.7368050668543278,
"grad_norm": 1.1586782932281494,
"learning_rate": 7.71859296482412e-05,
"loss": 2.2648,
"step": 617
},
{
"epoch": 1.7396199859254047,
"grad_norm": 0.9858127236366272,
"learning_rate": 7.698492462311559e-05,
"loss": 2.053,
"step": 618
},
{
"epoch": 1.7424349049964813,
"grad_norm": 0.966553807258606,
"learning_rate": 7.678391959798995e-05,
"loss": 2.2932,
"step": 619
},
{
"epoch": 1.745249824067558,
"grad_norm": 1.2511391639709473,
"learning_rate": 7.658291457286433e-05,
"loss": 2.4427,
"step": 620
},
{
"epoch": 1.748064743138635,
"grad_norm": 0.9545679092407227,
"learning_rate": 7.638190954773869e-05,
"loss": 2.5727,
"step": 621
},
{
"epoch": 1.7508796622097114,
"grad_norm": 0.9427103400230408,
"learning_rate": 7.618090452261307e-05,
"loss": 2.213,
"step": 622
},
{
"epoch": 1.7536945812807883,
"grad_norm": 0.9781317710876465,
"learning_rate": 7.597989949748744e-05,
"loss": 2.46,
"step": 623
},
{
"epoch": 1.756509500351865,
"grad_norm": 0.9984252452850342,
"learning_rate": 7.577889447236181e-05,
"loss": 2.5281,
"step": 624
},
{
"epoch": 1.7593244194229416,
"grad_norm": 1.1169133186340332,
"learning_rate": 7.557788944723618e-05,
"loss": 2.036,
"step": 625
},
{
"epoch": 1.7621393384940183,
"grad_norm": 0.9950047731399536,
"learning_rate": 7.537688442211056e-05,
"loss": 2.3513,
"step": 626
},
{
"epoch": 1.764954257565095,
"grad_norm": 1.1179485321044922,
"learning_rate": 7.517587939698493e-05,
"loss": 2.3691,
"step": 627
},
{
"epoch": 1.7677691766361718,
"grad_norm": 1.3760029077529907,
"learning_rate": 7.49748743718593e-05,
"loss": 2.5204,
"step": 628
},
{
"epoch": 1.7705840957072483,
"grad_norm": 1.0102930068969727,
"learning_rate": 7.477386934673368e-05,
"loss": 2.4864,
"step": 629
},
{
"epoch": 1.7733990147783252,
"grad_norm": 0.9945108890533447,
"learning_rate": 7.457286432160805e-05,
"loss": 2.4329,
"step": 630
},
{
"epoch": 1.7762139338494018,
"grad_norm": 0.9327785968780518,
"learning_rate": 7.437185929648241e-05,
"loss": 2.4472,
"step": 631
},
{
"epoch": 1.7790288529204785,
"grad_norm": 1.280433177947998,
"learning_rate": 7.417085427135678e-05,
"loss": 2.5645,
"step": 632
},
{
"epoch": 1.7818437719915552,
"grad_norm": 0.9169235229492188,
"learning_rate": 7.396984924623115e-05,
"loss": 2.3657,
"step": 633
},
{
"epoch": 1.7846586910626319,
"grad_norm": 0.8970544338226318,
"learning_rate": 7.376884422110553e-05,
"loss": 2.4142,
"step": 634
},
{
"epoch": 1.7874736101337088,
"grad_norm": 0.9948874711990356,
"learning_rate": 7.35678391959799e-05,
"loss": 2.28,
"step": 635
},
{
"epoch": 1.7902885292047852,
"grad_norm": 1.002539873123169,
"learning_rate": 7.336683417085427e-05,
"loss": 2.5045,
"step": 636
},
{
"epoch": 1.793103448275862,
"grad_norm": 0.9109549522399902,
"learning_rate": 7.316582914572865e-05,
"loss": 2.5396,
"step": 637
},
{
"epoch": 1.7959183673469388,
"grad_norm": 1.1350212097167969,
"learning_rate": 7.296482412060302e-05,
"loss": 2.435,
"step": 638
},
{
"epoch": 1.7987332864180154,
"grad_norm": 1.1271899938583374,
"learning_rate": 7.276381909547739e-05,
"loss": 2.366,
"step": 639
},
{
"epoch": 1.8015482054890923,
"grad_norm": 0.9859978556632996,
"learning_rate": 7.256281407035177e-05,
"loss": 2.4066,
"step": 640
},
{
"epoch": 1.8043631245601688,
"grad_norm": 1.0322918891906738,
"learning_rate": 7.236180904522614e-05,
"loss": 2.5016,
"step": 641
},
{
"epoch": 1.8071780436312457,
"grad_norm": 1.045304298400879,
"learning_rate": 7.21608040201005e-05,
"loss": 2.332,
"step": 642
},
{
"epoch": 1.8099929627023223,
"grad_norm": 1.0578879117965698,
"learning_rate": 7.195979899497488e-05,
"loss": 2.3446,
"step": 643
},
{
"epoch": 1.812807881773399,
"grad_norm": 1.385239839553833,
"learning_rate": 7.175879396984924e-05,
"loss": 2.1476,
"step": 644
},
{
"epoch": 1.8156228008444757,
"grad_norm": 0.9029603600502014,
"learning_rate": 7.155778894472363e-05,
"loss": 2.2793,
"step": 645
},
{
"epoch": 1.8184377199155524,
"grad_norm": 0.9613030552864075,
"learning_rate": 7.135678391959799e-05,
"loss": 2.5449,
"step": 646
},
{
"epoch": 1.8212526389866293,
"grad_norm": 0.9739305377006531,
"learning_rate": 7.115577889447236e-05,
"loss": 2.3309,
"step": 647
},
{
"epoch": 1.8240675580577057,
"grad_norm": 1.0117180347442627,
"learning_rate": 7.095477386934674e-05,
"loss": 2.4027,
"step": 648
},
{
"epoch": 1.8268824771287826,
"grad_norm": 1.2691757678985596,
"learning_rate": 7.075376884422111e-05,
"loss": 2.5499,
"step": 649
},
{
"epoch": 1.8296973961998593,
"grad_norm": 0.9493529200553894,
"learning_rate": 7.055276381909548e-05,
"loss": 2.2126,
"step": 650
},
{
"epoch": 1.832512315270936,
"grad_norm": 1.0153197050094604,
"learning_rate": 7.035175879396985e-05,
"loss": 2.2784,
"step": 651
},
{
"epoch": 1.8353272343420126,
"grad_norm": 0.9323593974113464,
"learning_rate": 7.015075376884423e-05,
"loss": 2.2333,
"step": 652
},
{
"epoch": 1.8381421534130893,
"grad_norm": 1.0737582445144653,
"learning_rate": 6.99497487437186e-05,
"loss": 2.4598,
"step": 653
},
{
"epoch": 1.8409570724841662,
"grad_norm": 0.9445222616195679,
"learning_rate": 6.974874371859297e-05,
"loss": 2.2218,
"step": 654
},
{
"epoch": 1.8437719915552426,
"grad_norm": 1.043349027633667,
"learning_rate": 6.954773869346733e-05,
"loss": 2.5126,
"step": 655
},
{
"epoch": 1.8465869106263195,
"grad_norm": 0.9958374500274658,
"learning_rate": 6.93467336683417e-05,
"loss": 2.3045,
"step": 656
},
{
"epoch": 1.8494018296973962,
"grad_norm": 0.9490264654159546,
"learning_rate": 6.914572864321608e-05,
"loss": 2.3651,
"step": 657
},
{
"epoch": 1.8522167487684729,
"grad_norm": 1.07566499710083,
"learning_rate": 6.894472361809045e-05,
"loss": 2.4624,
"step": 658
},
{
"epoch": 1.8550316678395498,
"grad_norm": 1.0136849880218506,
"learning_rate": 6.874371859296482e-05,
"loss": 2.3797,
"step": 659
},
{
"epoch": 1.8578465869106262,
"grad_norm": 1.0830200910568237,
"learning_rate": 6.85427135678392e-05,
"loss": 2.4643,
"step": 660
},
{
"epoch": 1.860661505981703,
"grad_norm": 0.920754075050354,
"learning_rate": 6.834170854271357e-05,
"loss": 2.375,
"step": 661
},
{
"epoch": 1.8634764250527798,
"grad_norm": 1.0753567218780518,
"learning_rate": 6.814070351758794e-05,
"loss": 2.4276,
"step": 662
},
{
"epoch": 1.8662913441238564,
"grad_norm": 1.2756551504135132,
"learning_rate": 6.793969849246232e-05,
"loss": 2.5999,
"step": 663
},
{
"epoch": 1.8691062631949331,
"grad_norm": 1.165073037147522,
"learning_rate": 6.773869346733669e-05,
"loss": 2.5914,
"step": 664
},
{
"epoch": 1.8719211822660098,
"grad_norm": 1.0647106170654297,
"learning_rate": 6.753768844221105e-05,
"loss": 2.1331,
"step": 665
},
{
"epoch": 1.8747361013370867,
"grad_norm": 1.023997187614441,
"learning_rate": 6.733668341708544e-05,
"loss": 2.1842,
"step": 666
},
{
"epoch": 1.8775510204081631,
"grad_norm": 1.1140164136886597,
"learning_rate": 6.71356783919598e-05,
"loss": 2.3345,
"step": 667
},
{
"epoch": 1.88036593947924,
"grad_norm": 0.9038817882537842,
"learning_rate": 6.693467336683418e-05,
"loss": 2.2091,
"step": 668
},
{
"epoch": 1.8831808585503167,
"grad_norm": 1.0922759771347046,
"learning_rate": 6.673366834170854e-05,
"loss": 2.4865,
"step": 669
},
{
"epoch": 1.8859957776213934,
"grad_norm": 0.912775456905365,
"learning_rate": 6.653266331658293e-05,
"loss": 2.3812,
"step": 670
},
{
"epoch": 1.88881069669247,
"grad_norm": 1.0939160585403442,
"learning_rate": 6.633165829145729e-05,
"loss": 2.5171,
"step": 671
},
{
"epoch": 1.8916256157635467,
"grad_norm": 1.234376311302185,
"learning_rate": 6.613065326633166e-05,
"loss": 2.1198,
"step": 672
},
{
"epoch": 1.8944405348346236,
"grad_norm": 1.0135494470596313,
"learning_rate": 6.592964824120603e-05,
"loss": 2.2054,
"step": 673
},
{
"epoch": 1.8972554539057,
"grad_norm": 1.0782523155212402,
"learning_rate": 6.57286432160804e-05,
"loss": 2.4281,
"step": 674
},
{
"epoch": 1.900070372976777,
"grad_norm": 1.0938283205032349,
"learning_rate": 6.552763819095478e-05,
"loss": 2.4033,
"step": 675
},
{
"epoch": 1.9028852920478536,
"grad_norm": 1.080575942993164,
"learning_rate": 6.532663316582915e-05,
"loss": 2.2659,
"step": 676
},
{
"epoch": 1.9057002111189303,
"grad_norm": 0.9452334642410278,
"learning_rate": 6.512562814070352e-05,
"loss": 2.1833,
"step": 677
},
{
"epoch": 1.9085151301900072,
"grad_norm": 0.9910850524902344,
"learning_rate": 6.492462311557788e-05,
"loss": 2.4197,
"step": 678
},
{
"epoch": 1.9113300492610836,
"grad_norm": 0.9662689566612244,
"learning_rate": 6.472361809045227e-05,
"loss": 2.2544,
"step": 679
},
{
"epoch": 1.9141449683321605,
"grad_norm": 0.8501513004302979,
"learning_rate": 6.452261306532663e-05,
"loss": 2.3498,
"step": 680
},
{
"epoch": 1.9169598874032372,
"grad_norm": 1.2835460901260376,
"learning_rate": 6.4321608040201e-05,
"loss": 2.5167,
"step": 681
},
{
"epoch": 1.9197748064743139,
"grad_norm": 0.9385748505592346,
"learning_rate": 6.412060301507538e-05,
"loss": 2.362,
"step": 682
},
{
"epoch": 1.9225897255453905,
"grad_norm": 0.9999021887779236,
"learning_rate": 6.391959798994975e-05,
"loss": 2.2323,
"step": 683
},
{
"epoch": 1.9254046446164672,
"grad_norm": 1.0630273818969727,
"learning_rate": 6.371859296482412e-05,
"loss": 2.4489,
"step": 684
},
{
"epoch": 1.928219563687544,
"grad_norm": 0.9764763116836548,
"learning_rate": 6.35175879396985e-05,
"loss": 2.2774,
"step": 685
},
{
"epoch": 1.9310344827586206,
"grad_norm": 0.8416815996170044,
"learning_rate": 6.331658291457287e-05,
"loss": 2.2472,
"step": 686
},
{
"epoch": 1.9338494018296974,
"grad_norm": 0.8282995820045471,
"learning_rate": 6.311557788944724e-05,
"loss": 2.1936,
"step": 687
},
{
"epoch": 1.9366643209007741,
"grad_norm": 0.8734938502311707,
"learning_rate": 6.291457286432161e-05,
"loss": 2.365,
"step": 688
},
{
"epoch": 1.9394792399718508,
"grad_norm": 1.1187288761138916,
"learning_rate": 6.271356783919599e-05,
"loss": 2.3209,
"step": 689
},
{
"epoch": 1.9422941590429277,
"grad_norm": 0.9740754961967468,
"learning_rate": 6.251256281407035e-05,
"loss": 2.363,
"step": 690
},
{
"epoch": 1.9451090781140041,
"grad_norm": 1.023774266242981,
"learning_rate": 6.231155778894473e-05,
"loss": 2.2532,
"step": 691
},
{
"epoch": 1.947923997185081,
"grad_norm": 1.019603967666626,
"learning_rate": 6.211055276381909e-05,
"loss": 2.3242,
"step": 692
},
{
"epoch": 1.9507389162561575,
"grad_norm": 1.0288832187652588,
"learning_rate": 6.190954773869348e-05,
"loss": 2.6201,
"step": 693
},
{
"epoch": 1.9535538353272344,
"grad_norm": 0.9193139672279358,
"learning_rate": 6.170854271356784e-05,
"loss": 2.1794,
"step": 694
},
{
"epoch": 1.956368754398311,
"grad_norm": 0.9151753187179565,
"learning_rate": 6.150753768844222e-05,
"loss": 2.3056,
"step": 695
},
{
"epoch": 1.9591836734693877,
"grad_norm": 1.0923340320587158,
"learning_rate": 6.130653266331658e-05,
"loss": 2.2293,
"step": 696
},
{
"epoch": 1.9619985925404646,
"grad_norm": 0.9855085015296936,
"learning_rate": 6.110552763819096e-05,
"loss": 2.5036,
"step": 697
},
{
"epoch": 1.964813511611541,
"grad_norm": 0.9077695608139038,
"learning_rate": 6.090452261306533e-05,
"loss": 2.3812,
"step": 698
},
{
"epoch": 1.967628430682618,
"grad_norm": 0.9925841689109802,
"learning_rate": 6.070351758793971e-05,
"loss": 2.4093,
"step": 699
},
{
"epoch": 1.9704433497536946,
"grad_norm": 0.9586440324783325,
"learning_rate": 6.0502512562814076e-05,
"loss": 2.4851,
"step": 700
},
{
"epoch": 1.9732582688247713,
"grad_norm": 0.9627270698547363,
"learning_rate": 6.030150753768844e-05,
"loss": 2.5393,
"step": 701
},
{
"epoch": 1.976073187895848,
"grad_norm": 1.0779012441635132,
"learning_rate": 6.0100502512562815e-05,
"loss": 2.3773,
"step": 702
},
{
"epoch": 1.9788881069669246,
"grad_norm": 0.9611048698425293,
"learning_rate": 5.989949748743718e-05,
"loss": 2.3767,
"step": 703
},
{
"epoch": 1.9817030260380015,
"grad_norm": 0.9639135599136353,
"learning_rate": 5.969849246231156e-05,
"loss": 2.2936,
"step": 704
},
{
"epoch": 1.984517945109078,
"grad_norm": 0.9925483465194702,
"learning_rate": 5.949748743718593e-05,
"loss": 2.3456,
"step": 705
},
{
"epoch": 1.9873328641801549,
"grad_norm": 1.1674792766571045,
"learning_rate": 5.929648241206031e-05,
"loss": 2.3306,
"step": 706
},
{
"epoch": 1.9901477832512315,
"grad_norm": 1.0779776573181152,
"learning_rate": 5.909547738693467e-05,
"loss": 2.4675,
"step": 707
},
{
"epoch": 1.9929627023223082,
"grad_norm": 1.2033969163894653,
"learning_rate": 5.889447236180905e-05,
"loss": 2.3369,
"step": 708
},
{
"epoch": 1.995777621393385,
"grad_norm": 1.01941978931427,
"learning_rate": 5.869346733668342e-05,
"loss": 2.3271,
"step": 709
},
{
"epoch": 1.9985925404644616,
"grad_norm": 0.8707964420318604,
"learning_rate": 5.849246231155779e-05,
"loss": 2.4758,
"step": 710
},
{
"epoch": 2.0014074595355384,
"grad_norm": 0.9140713810920715,
"learning_rate": 5.829145728643216e-05,
"loss": 2.2651,
"step": 711
},
{
"epoch": 2.004222378606615,
"grad_norm": 1.016658902168274,
"learning_rate": 5.809045226130654e-05,
"loss": 1.7622,
"step": 712
},
{
"epoch": 2.007037297677692,
"grad_norm": 1.0154222249984741,
"learning_rate": 5.7889447236180904e-05,
"loss": 1.8809,
"step": 713
},
{
"epoch": 2.0098522167487687,
"grad_norm": 0.996522843837738,
"learning_rate": 5.7688442211055284e-05,
"loss": 2.1164,
"step": 714
},
{
"epoch": 2.012667135819845,
"grad_norm": 1.0130479335784912,
"learning_rate": 5.748743718592965e-05,
"loss": 2.1545,
"step": 715
},
{
"epoch": 2.015482054890922,
"grad_norm": 1.0692882537841797,
"learning_rate": 5.728643216080403e-05,
"loss": 2.0243,
"step": 716
},
{
"epoch": 2.0182969739619985,
"grad_norm": 1.0084363222122192,
"learning_rate": 5.7085427135678396e-05,
"loss": 1.984,
"step": 717
},
{
"epoch": 2.0211118930330754,
"grad_norm": 1.1399943828582764,
"learning_rate": 5.688442211055277e-05,
"loss": 1.9587,
"step": 718
},
{
"epoch": 2.023926812104152,
"grad_norm": 1.6473337411880493,
"learning_rate": 5.6683417085427135e-05,
"loss": 2.0001,
"step": 719
},
{
"epoch": 2.0267417311752287,
"grad_norm": 1.5070980787277222,
"learning_rate": 5.6482412060301515e-05,
"loss": 1.8712,
"step": 720
},
{
"epoch": 2.0295566502463056,
"grad_norm": 1.262854814529419,
"learning_rate": 5.628140703517588e-05,
"loss": 2.1427,
"step": 721
},
{
"epoch": 2.032371569317382,
"grad_norm": 1.7418184280395508,
"learning_rate": 5.608040201005026e-05,
"loss": 1.9851,
"step": 722
},
{
"epoch": 2.035186488388459,
"grad_norm": 1.5306885242462158,
"learning_rate": 5.587939698492463e-05,
"loss": 2.0147,
"step": 723
},
{
"epoch": 2.0380014074595354,
"grad_norm": 1.3408687114715576,
"learning_rate": 5.567839195979899e-05,
"loss": 2.1003,
"step": 724
},
{
"epoch": 2.0408163265306123,
"grad_norm": 1.3069605827331543,
"learning_rate": 5.547738693467337e-05,
"loss": 2.1251,
"step": 725
},
{
"epoch": 2.0436312456016887,
"grad_norm": 1.357084035873413,
"learning_rate": 5.527638190954774e-05,
"loss": 1.7681,
"step": 726
},
{
"epoch": 2.0464461646727656,
"grad_norm": 1.2578508853912354,
"learning_rate": 5.507537688442211e-05,
"loss": 1.8022,
"step": 727
},
{
"epoch": 2.0492610837438425,
"grad_norm": 1.2653518915176392,
"learning_rate": 5.487437185929648e-05,
"loss": 2.0304,
"step": 728
},
{
"epoch": 2.052076002814919,
"grad_norm": 1.2066705226898193,
"learning_rate": 5.467336683417086e-05,
"loss": 1.9383,
"step": 729
},
{
"epoch": 2.054890921885996,
"grad_norm": 1.2147313356399536,
"learning_rate": 5.4472361809045224e-05,
"loss": 1.9466,
"step": 730
},
{
"epoch": 2.0577058409570723,
"grad_norm": 1.2353148460388184,
"learning_rate": 5.4271356783919604e-05,
"loss": 2.1842,
"step": 731
},
{
"epoch": 2.060520760028149,
"grad_norm": 1.2019646167755127,
"learning_rate": 5.407035175879397e-05,
"loss": 1.9507,
"step": 732
},
{
"epoch": 2.063335679099226,
"grad_norm": 1.2473183870315552,
"learning_rate": 5.386934673366835e-05,
"loss": 1.9146,
"step": 733
},
{
"epoch": 2.0661505981703026,
"grad_norm": 1.3237521648406982,
"learning_rate": 5.3668341708542716e-05,
"loss": 2.026,
"step": 734
},
{
"epoch": 2.0689655172413794,
"grad_norm": 1.6444705724716187,
"learning_rate": 5.346733668341709e-05,
"loss": 2.1689,
"step": 735
},
{
"epoch": 2.071780436312456,
"grad_norm": 1.4315435886383057,
"learning_rate": 5.3266331658291455e-05,
"loss": 2.1981,
"step": 736
},
{
"epoch": 2.074595355383533,
"grad_norm": 1.7484960556030273,
"learning_rate": 5.3065326633165835e-05,
"loss": 1.998,
"step": 737
},
{
"epoch": 2.0774102744546092,
"grad_norm": 1.4129494428634644,
"learning_rate": 5.28643216080402e-05,
"loss": 1.9493,
"step": 738
},
{
"epoch": 2.080225193525686,
"grad_norm": 1.7426577806472778,
"learning_rate": 5.266331658291458e-05,
"loss": 1.8491,
"step": 739
},
{
"epoch": 2.083040112596763,
"grad_norm": 1.4977487325668335,
"learning_rate": 5.246231155778895e-05,
"loss": 2.0474,
"step": 740
},
{
"epoch": 2.0858550316678395,
"grad_norm": 1.479008674621582,
"learning_rate": 5.226130653266332e-05,
"loss": 2.036,
"step": 741
},
{
"epoch": 2.0886699507389164,
"grad_norm": 1.8596562147140503,
"learning_rate": 5.206030150753769e-05,
"loss": 2.0772,
"step": 742
},
{
"epoch": 2.091484869809993,
"grad_norm": 1.4239286184310913,
"learning_rate": 5.1859296482412066e-05,
"loss": 1.9441,
"step": 743
},
{
"epoch": 2.0942997888810697,
"grad_norm": 1.3117451667785645,
"learning_rate": 5.165829145728643e-05,
"loss": 1.8894,
"step": 744
},
{
"epoch": 2.097114707952146,
"grad_norm": 1.255926251411438,
"learning_rate": 5.145728643216081e-05,
"loss": 2.1199,
"step": 745
},
{
"epoch": 2.099929627023223,
"grad_norm": 1.6750807762145996,
"learning_rate": 5.125628140703518e-05,
"loss": 1.9147,
"step": 746
},
{
"epoch": 2.1027445460943,
"grad_norm": 1.307915210723877,
"learning_rate": 5.1055276381909544e-05,
"loss": 2.1592,
"step": 747
},
{
"epoch": 2.1055594651653764,
"grad_norm": 1.3630294799804688,
"learning_rate": 5.0854271356783924e-05,
"loss": 1.7529,
"step": 748
},
{
"epoch": 2.1083743842364533,
"grad_norm": 1.4674683809280396,
"learning_rate": 5.065326633165829e-05,
"loss": 1.9104,
"step": 749
},
{
"epoch": 2.1111893033075297,
"grad_norm": 1.2863240242004395,
"learning_rate": 5.045226130653266e-05,
"loss": 1.806,
"step": 750
},
{
"epoch": 2.1140042223786066,
"grad_norm": 1.4659481048583984,
"learning_rate": 5.0251256281407036e-05,
"loss": 2.1066,
"step": 751
},
{
"epoch": 2.1168191414496835,
"grad_norm": 1.4531869888305664,
"learning_rate": 5.005025125628141e-05,
"loss": 1.9145,
"step": 752
},
{
"epoch": 2.11963406052076,
"grad_norm": 1.4428577423095703,
"learning_rate": 4.984924623115578e-05,
"loss": 2.0346,
"step": 753
},
{
"epoch": 2.122448979591837,
"grad_norm": 1.6657663583755493,
"learning_rate": 4.9648241206030155e-05,
"loss": 1.8736,
"step": 754
},
{
"epoch": 2.1252638986629133,
"grad_norm": 1.45827317237854,
"learning_rate": 4.944723618090453e-05,
"loss": 1.8692,
"step": 755
},
{
"epoch": 2.12807881773399,
"grad_norm": 1.4714118242263794,
"learning_rate": 4.92462311557789e-05,
"loss": 2.1693,
"step": 756
},
{
"epoch": 2.1308937368050667,
"grad_norm": 1.4159564971923828,
"learning_rate": 4.9045226130653274e-05,
"loss": 2.0312,
"step": 757
},
{
"epoch": 2.1337086558761436,
"grad_norm": 1.409199833869934,
"learning_rate": 4.884422110552764e-05,
"loss": 2.043,
"step": 758
},
{
"epoch": 2.1365235749472204,
"grad_norm": 1.4486503601074219,
"learning_rate": 4.864321608040201e-05,
"loss": 1.8924,
"step": 759
},
{
"epoch": 2.139338494018297,
"grad_norm": 1.5751312971115112,
"learning_rate": 4.844221105527638e-05,
"loss": 1.8043,
"step": 760
},
{
"epoch": 2.142153413089374,
"grad_norm": 1.4712185859680176,
"learning_rate": 4.824120603015075e-05,
"loss": 1.9763,
"step": 761
},
{
"epoch": 2.1449683321604502,
"grad_norm": 1.3530285358428955,
"learning_rate": 4.8040201005025125e-05,
"loss": 1.7487,
"step": 762
},
{
"epoch": 2.147783251231527,
"grad_norm": 1.5095936059951782,
"learning_rate": 4.78391959798995e-05,
"loss": 2.0177,
"step": 763
},
{
"epoch": 2.1505981703026036,
"grad_norm": 1.2347254753112793,
"learning_rate": 4.763819095477387e-05,
"loss": 1.9277,
"step": 764
},
{
"epoch": 2.1534130893736805,
"grad_norm": 1.728926658630371,
"learning_rate": 4.7437185929648244e-05,
"loss": 2.0394,
"step": 765
},
{
"epoch": 2.1562280084447574,
"grad_norm": 1.3105862140655518,
"learning_rate": 4.723618090452262e-05,
"loss": 1.9132,
"step": 766
},
{
"epoch": 2.159042927515834,
"grad_norm": 1.4253538846969604,
"learning_rate": 4.703517587939698e-05,
"loss": 1.7072,
"step": 767
},
{
"epoch": 2.1618578465869107,
"grad_norm": 1.5160298347473145,
"learning_rate": 4.6834170854271356e-05,
"loss": 2.3564,
"step": 768
},
{
"epoch": 2.164672765657987,
"grad_norm": 1.384318470954895,
"learning_rate": 4.663316582914573e-05,
"loss": 1.911,
"step": 769
},
{
"epoch": 2.167487684729064,
"grad_norm": 1.6801820993423462,
"learning_rate": 4.64321608040201e-05,
"loss": 2.0506,
"step": 770
},
{
"epoch": 2.170302603800141,
"grad_norm": 1.401426076889038,
"learning_rate": 4.6231155778894475e-05,
"loss": 1.6387,
"step": 771
},
{
"epoch": 2.1731175228712174,
"grad_norm": 1.4150290489196777,
"learning_rate": 4.603015075376885e-05,
"loss": 2.1757,
"step": 772
},
{
"epoch": 2.1759324419422943,
"grad_norm": 1.909029483795166,
"learning_rate": 4.582914572864322e-05,
"loss": 1.7829,
"step": 773
},
{
"epoch": 2.1787473610133707,
"grad_norm": 1.530287265777588,
"learning_rate": 4.5628140703517594e-05,
"loss": 2.0015,
"step": 774
},
{
"epoch": 2.1815622800844476,
"grad_norm": 1.5270819664001465,
"learning_rate": 4.542713567839196e-05,
"loss": 2.1369,
"step": 775
},
{
"epoch": 2.184377199155524,
"grad_norm": 2.737819194793701,
"learning_rate": 4.522613065326633e-05,
"loss": 1.7966,
"step": 776
},
{
"epoch": 2.187192118226601,
"grad_norm": 1.4319558143615723,
"learning_rate": 4.5025125628140706e-05,
"loss": 1.949,
"step": 777
},
{
"epoch": 2.190007037297678,
"grad_norm": 1.5846929550170898,
"learning_rate": 4.482412060301508e-05,
"loss": 1.9676,
"step": 778
},
{
"epoch": 2.1928219563687543,
"grad_norm": 1.3088924884796143,
"learning_rate": 4.462311557788945e-05,
"loss": 1.9452,
"step": 779
},
{
"epoch": 2.195636875439831,
"grad_norm": 1.4991919994354248,
"learning_rate": 4.4422110552763825e-05,
"loss": 1.8966,
"step": 780
},
{
"epoch": 2.1984517945109077,
"grad_norm": 1.306575059890747,
"learning_rate": 4.42211055276382e-05,
"loss": 1.9106,
"step": 781
},
{
"epoch": 2.2012667135819846,
"grad_norm": 1.562092900276184,
"learning_rate": 4.4020100502512564e-05,
"loss": 1.8167,
"step": 782
},
{
"epoch": 2.204081632653061,
"grad_norm": 1.6543974876403809,
"learning_rate": 4.381909547738694e-05,
"loss": 1.9092,
"step": 783
},
{
"epoch": 2.206896551724138,
"grad_norm": 1.485269546508789,
"learning_rate": 4.3618090452261303e-05,
"loss": 1.9781,
"step": 784
},
{
"epoch": 2.209711470795215,
"grad_norm": 1.2385632991790771,
"learning_rate": 4.3417085427135676e-05,
"loss": 1.87,
"step": 785
},
{
"epoch": 2.2125263898662912,
"grad_norm": 1.2291756868362427,
"learning_rate": 4.321608040201005e-05,
"loss": 2.0753,
"step": 786
},
{
"epoch": 2.215341308937368,
"grad_norm": 1.3407044410705566,
"learning_rate": 4.301507537688442e-05,
"loss": 1.8011,
"step": 787
},
{
"epoch": 2.2181562280084446,
"grad_norm": 1.5226972103118896,
"learning_rate": 4.2814070351758795e-05,
"loss": 2.031,
"step": 788
},
{
"epoch": 2.2209711470795215,
"grad_norm": 1.2508612871170044,
"learning_rate": 4.261306532663317e-05,
"loss": 1.9893,
"step": 789
},
{
"epoch": 2.2237860661505984,
"grad_norm": 1.8101375102996826,
"learning_rate": 4.241206030150754e-05,
"loss": 2.0614,
"step": 790
},
{
"epoch": 2.226600985221675,
"grad_norm": 1.3492687940597534,
"learning_rate": 4.2211055276381914e-05,
"loss": 1.5472,
"step": 791
},
{
"epoch": 2.2294159042927517,
"grad_norm": 1.4373085498809814,
"learning_rate": 4.201005025125628e-05,
"loss": 1.9417,
"step": 792
},
{
"epoch": 2.232230823363828,
"grad_norm": 1.343981385231018,
"learning_rate": 4.180904522613065e-05,
"loss": 1.9317,
"step": 793
},
{
"epoch": 2.235045742434905,
"grad_norm": 1.3740363121032715,
"learning_rate": 4.1608040201005026e-05,
"loss": 1.7677,
"step": 794
},
{
"epoch": 2.2378606615059815,
"grad_norm": 1.4676454067230225,
"learning_rate": 4.14070351758794e-05,
"loss": 1.9661,
"step": 795
},
{
"epoch": 2.2406755805770584,
"grad_norm": 1.320854902267456,
"learning_rate": 4.120603015075377e-05,
"loss": 1.8218,
"step": 796
},
{
"epoch": 2.2434904996481353,
"grad_norm": 1.7027606964111328,
"learning_rate": 4.1005025125628145e-05,
"loss": 1.979,
"step": 797
},
{
"epoch": 2.2463054187192117,
"grad_norm": 1.363239049911499,
"learning_rate": 4.080402010050252e-05,
"loss": 1.8902,
"step": 798
},
{
"epoch": 2.2491203377902886,
"grad_norm": 1.707664966583252,
"learning_rate": 4.060301507537689e-05,
"loss": 2.0026,
"step": 799
},
{
"epoch": 2.251935256861365,
"grad_norm": 1.7282025814056396,
"learning_rate": 4.040201005025126e-05,
"loss": 1.8098,
"step": 800
},
{
"epoch": 2.254750175932442,
"grad_norm": 1.5891460180282593,
"learning_rate": 4.020100502512563e-05,
"loss": 1.8639,
"step": 801
},
{
"epoch": 2.2575650950035184,
"grad_norm": 1.5569334030151367,
"learning_rate": 4e-05,
"loss": 2.1027,
"step": 802
},
{
"epoch": 2.2603800140745953,
"grad_norm": 1.4195587635040283,
"learning_rate": 3.9798994974874376e-05,
"loss": 1.8757,
"step": 803
},
{
"epoch": 2.263194933145672,
"grad_norm": 1.3400124311447144,
"learning_rate": 3.959798994974875e-05,
"loss": 1.943,
"step": 804
},
{
"epoch": 2.2660098522167487,
"grad_norm": 1.4379513263702393,
"learning_rate": 3.9396984924623115e-05,
"loss": 2.0351,
"step": 805
},
{
"epoch": 2.2688247712878256,
"grad_norm": 1.8557440042495728,
"learning_rate": 3.919597989949749e-05,
"loss": 1.6522,
"step": 806
},
{
"epoch": 2.271639690358902,
"grad_norm": 1.68703031539917,
"learning_rate": 3.899497487437186e-05,
"loss": 1.8939,
"step": 807
},
{
"epoch": 2.274454609429979,
"grad_norm": 1.4797513484954834,
"learning_rate": 3.8793969849246234e-05,
"loss": 1.7212,
"step": 808
},
{
"epoch": 2.277269528501056,
"grad_norm": 2.070215940475464,
"learning_rate": 3.85929648241206e-05,
"loss": 1.9758,
"step": 809
},
{
"epoch": 2.2800844475721322,
"grad_norm": 1.4958938360214233,
"learning_rate": 3.8391959798994973e-05,
"loss": 2.0214,
"step": 810
},
{
"epoch": 2.282899366643209,
"grad_norm": 1.4052972793579102,
"learning_rate": 3.8190954773869346e-05,
"loss": 1.9959,
"step": 811
},
{
"epoch": 2.2857142857142856,
"grad_norm": 1.652631402015686,
"learning_rate": 3.798994974874372e-05,
"loss": 1.8977,
"step": 812
},
{
"epoch": 2.2885292047853625,
"grad_norm": 1.4963494539260864,
"learning_rate": 3.778894472361809e-05,
"loss": 1.7929,
"step": 813
},
{
"epoch": 2.2913441238564394,
"grad_norm": 1.554140329360962,
"learning_rate": 3.7587939698492465e-05,
"loss": 1.9422,
"step": 814
},
{
"epoch": 2.294159042927516,
"grad_norm": 1.5336120128631592,
"learning_rate": 3.738693467336684e-05,
"loss": 2.0812,
"step": 815
},
{
"epoch": 2.2969739619985927,
"grad_norm": 1.998458981513977,
"learning_rate": 3.7185929648241204e-05,
"loss": 1.8639,
"step": 816
},
{
"epoch": 2.299788881069669,
"grad_norm": 1.662591814994812,
"learning_rate": 3.698492462311558e-05,
"loss": 2.0354,
"step": 817
},
{
"epoch": 2.302603800140746,
"grad_norm": 1.6507760286331177,
"learning_rate": 3.678391959798995e-05,
"loss": 2.0235,
"step": 818
},
{
"epoch": 2.3054187192118225,
"grad_norm": 1.6057195663452148,
"learning_rate": 3.658291457286432e-05,
"loss": 1.9731,
"step": 819
},
{
"epoch": 2.3082336382828994,
"grad_norm": 1.5184822082519531,
"learning_rate": 3.6381909547738696e-05,
"loss": 1.7881,
"step": 820
},
{
"epoch": 2.311048557353976,
"grad_norm": 1.5405902862548828,
"learning_rate": 3.618090452261307e-05,
"loss": 1.9947,
"step": 821
},
{
"epoch": 2.3138634764250527,
"grad_norm": 1.880598545074463,
"learning_rate": 3.597989949748744e-05,
"loss": 1.9817,
"step": 822
},
{
"epoch": 2.3166783954961296,
"grad_norm": 1.707992434501648,
"learning_rate": 3.5778894472361815e-05,
"loss": 1.7769,
"step": 823
},
{
"epoch": 2.319493314567206,
"grad_norm": 1.4633187055587769,
"learning_rate": 3.557788944723618e-05,
"loss": 1.8575,
"step": 824
},
{
"epoch": 2.322308233638283,
"grad_norm": 1.4804046154022217,
"learning_rate": 3.5376884422110554e-05,
"loss": 1.9168,
"step": 825
},
{
"epoch": 2.3251231527093594,
"grad_norm": 1.6535083055496216,
"learning_rate": 3.517587939698493e-05,
"loss": 2.0092,
"step": 826
},
{
"epoch": 2.3279380717804363,
"grad_norm": 1.45828115940094,
"learning_rate": 3.49748743718593e-05,
"loss": 1.824,
"step": 827
},
{
"epoch": 2.330752990851513,
"grad_norm": 1.4650769233703613,
"learning_rate": 3.4773869346733667e-05,
"loss": 2.0699,
"step": 828
},
{
"epoch": 2.3335679099225897,
"grad_norm": 1.629009485244751,
"learning_rate": 3.457286432160804e-05,
"loss": 1.856,
"step": 829
},
{
"epoch": 2.3363828289936666,
"grad_norm": 1.6346815824508667,
"learning_rate": 3.437185929648241e-05,
"loss": 1.7844,
"step": 830
},
{
"epoch": 2.339197748064743,
"grad_norm": 2.298454523086548,
"learning_rate": 3.4170854271356785e-05,
"loss": 1.8711,
"step": 831
},
{
"epoch": 2.34201266713582,
"grad_norm": 1.6962236166000366,
"learning_rate": 3.396984924623116e-05,
"loss": 1.9879,
"step": 832
},
{
"epoch": 2.344827586206897,
"grad_norm": 1.5078356266021729,
"learning_rate": 3.3768844221105525e-05,
"loss": 1.6655,
"step": 833
},
{
"epoch": 2.3476425052779732,
"grad_norm": 1.9207500219345093,
"learning_rate": 3.35678391959799e-05,
"loss": 1.7975,
"step": 834
},
{
"epoch": 2.35045742434905,
"grad_norm": 1.5677213668823242,
"learning_rate": 3.336683417085427e-05,
"loss": 2.0483,
"step": 835
},
{
"epoch": 2.3532723434201266,
"grad_norm": 1.6488611698150635,
"learning_rate": 3.3165829145728643e-05,
"loss": 1.8616,
"step": 836
},
{
"epoch": 2.3560872624912035,
"grad_norm": 1.718641996383667,
"learning_rate": 3.2964824120603016e-05,
"loss": 1.9279,
"step": 837
},
{
"epoch": 2.35890218156228,
"grad_norm": 1.6529680490493774,
"learning_rate": 3.276381909547739e-05,
"loss": 2.1128,
"step": 838
},
{
"epoch": 2.361717100633357,
"grad_norm": 1.691375732421875,
"learning_rate": 3.256281407035176e-05,
"loss": 1.801,
"step": 839
},
{
"epoch": 2.3645320197044333,
"grad_norm": 1.6755695343017578,
"learning_rate": 3.2361809045226135e-05,
"loss": 2.0209,
"step": 840
},
{
"epoch": 2.36734693877551,
"grad_norm": 1.3565911054611206,
"learning_rate": 3.21608040201005e-05,
"loss": 1.8019,
"step": 841
},
{
"epoch": 2.370161857846587,
"grad_norm": 1.4155783653259277,
"learning_rate": 3.1959798994974875e-05,
"loss": 1.9121,
"step": 842
},
{
"epoch": 2.3729767769176635,
"grad_norm": 1.6148691177368164,
"learning_rate": 3.175879396984925e-05,
"loss": 1.9028,
"step": 843
},
{
"epoch": 2.3757916959887404,
"grad_norm": 1.408504605293274,
"learning_rate": 3.155778894472362e-05,
"loss": 1.7751,
"step": 844
},
{
"epoch": 2.378606615059817,
"grad_norm": 1.4195948839187622,
"learning_rate": 3.1356783919597993e-05,
"loss": 1.8083,
"step": 845
},
{
"epoch": 2.3814215341308937,
"grad_norm": 1.5938619375228882,
"learning_rate": 3.1155778894472366e-05,
"loss": 1.8096,
"step": 846
},
{
"epoch": 2.3842364532019706,
"grad_norm": 1.4775474071502686,
"learning_rate": 3.095477386934674e-05,
"loss": 2.0482,
"step": 847
},
{
"epoch": 2.387051372273047,
"grad_norm": 1.3416311740875244,
"learning_rate": 3.075376884422111e-05,
"loss": 1.8107,
"step": 848
},
{
"epoch": 2.389866291344124,
"grad_norm": 1.6304892301559448,
"learning_rate": 3.055276381909548e-05,
"loss": 1.913,
"step": 849
},
{
"epoch": 2.3926812104152004,
"grad_norm": 1.2348568439483643,
"learning_rate": 3.0351758793969855e-05,
"loss": 1.8254,
"step": 850
},
{
"epoch": 2.3954961294862773,
"grad_norm": 1.6292930841445923,
"learning_rate": 3.015075376884422e-05,
"loss": 2.1953,
"step": 851
},
{
"epoch": 2.398311048557354,
"grad_norm": 1.6192723512649536,
"learning_rate": 2.994974874371859e-05,
"loss": 1.7694,
"step": 852
},
{
"epoch": 2.4011259676284307,
"grad_norm": 1.6231796741485596,
"learning_rate": 2.9748743718592964e-05,
"loss": 2.1208,
"step": 853
},
{
"epoch": 2.4039408866995076,
"grad_norm": 1.3113828897476196,
"learning_rate": 2.9547738693467337e-05,
"loss": 1.9544,
"step": 854
},
{
"epoch": 2.406755805770584,
"grad_norm": 1.6705840826034546,
"learning_rate": 2.934673366834171e-05,
"loss": 2.0316,
"step": 855
},
{
"epoch": 2.409570724841661,
"grad_norm": 1.7489991188049316,
"learning_rate": 2.914572864321608e-05,
"loss": 2.1702,
"step": 856
},
{
"epoch": 2.4123856439127374,
"grad_norm": 1.7634392976760864,
"learning_rate": 2.8944723618090452e-05,
"loss": 2.003,
"step": 857
},
{
"epoch": 2.4152005629838142,
"grad_norm": 1.607228398323059,
"learning_rate": 2.8743718592964825e-05,
"loss": 1.9383,
"step": 858
},
{
"epoch": 2.4180154820548907,
"grad_norm": 1.661271095275879,
"learning_rate": 2.8542713567839198e-05,
"loss": 1.9874,
"step": 859
},
{
"epoch": 2.4208304011259676,
"grad_norm": 1.4608184099197388,
"learning_rate": 2.8341708542713568e-05,
"loss": 1.8086,
"step": 860
},
{
"epoch": 2.4236453201970445,
"grad_norm": 1.4614999294281006,
"learning_rate": 2.814070351758794e-05,
"loss": 2.069,
"step": 861
},
{
"epoch": 2.426460239268121,
"grad_norm": 1.5575437545776367,
"learning_rate": 2.7939698492462314e-05,
"loss": 1.8134,
"step": 862
},
{
"epoch": 2.429275158339198,
"grad_norm": 1.5272866487503052,
"learning_rate": 2.7738693467336686e-05,
"loss": 1.9724,
"step": 863
},
{
"epoch": 2.4320900774102743,
"grad_norm": 1.524636149406433,
"learning_rate": 2.7537688442211056e-05,
"loss": 1.8841,
"step": 864
},
{
"epoch": 2.434904996481351,
"grad_norm": 1.7057536840438843,
"learning_rate": 2.733668341708543e-05,
"loss": 1.9102,
"step": 865
},
{
"epoch": 2.437719915552428,
"grad_norm": 1.4803720712661743,
"learning_rate": 2.7135678391959802e-05,
"loss": 1.8558,
"step": 866
},
{
"epoch": 2.4405348346235045,
"grad_norm": 1.486907958984375,
"learning_rate": 2.6934673366834175e-05,
"loss": 2.0651,
"step": 867
},
{
"epoch": 2.4433497536945814,
"grad_norm": 1.5045924186706543,
"learning_rate": 2.6733668341708545e-05,
"loss": 1.8523,
"step": 868
},
{
"epoch": 2.446164672765658,
"grad_norm": 1.5075145959854126,
"learning_rate": 2.6532663316582917e-05,
"loss": 2.1232,
"step": 869
},
{
"epoch": 2.4489795918367347,
"grad_norm": 1.5650744438171387,
"learning_rate": 2.633165829145729e-05,
"loss": 1.9524,
"step": 870
},
{
"epoch": 2.4517945109078116,
"grad_norm": 1.4480630159378052,
"learning_rate": 2.613065326633166e-05,
"loss": 1.8816,
"step": 871
},
{
"epoch": 2.454609429978888,
"grad_norm": 1.7436559200286865,
"learning_rate": 2.5929648241206033e-05,
"loss": 2.0141,
"step": 872
},
{
"epoch": 2.457424349049965,
"grad_norm": 1.647824764251709,
"learning_rate": 2.5728643216080406e-05,
"loss": 1.9715,
"step": 873
},
{
"epoch": 2.4602392681210414,
"grad_norm": 1.5032564401626587,
"learning_rate": 2.5527638190954772e-05,
"loss": 1.9423,
"step": 874
},
{
"epoch": 2.4630541871921183,
"grad_norm": 1.615399718284607,
"learning_rate": 2.5326633165829145e-05,
"loss": 2.1547,
"step": 875
},
{
"epoch": 2.4658691062631948,
"grad_norm": 1.66806161403656,
"learning_rate": 2.5125628140703518e-05,
"loss": 2.0234,
"step": 876
},
{
"epoch": 2.4686840253342717,
"grad_norm": 1.69028902053833,
"learning_rate": 2.492462311557789e-05,
"loss": 1.9796,
"step": 877
},
{
"epoch": 2.471498944405348,
"grad_norm": 1.5616704225540161,
"learning_rate": 2.4723618090452264e-05,
"loss": 1.979,
"step": 878
},
{
"epoch": 2.474313863476425,
"grad_norm": 1.9459314346313477,
"learning_rate": 2.4522613065326637e-05,
"loss": 2.0534,
"step": 879
},
{
"epoch": 2.477128782547502,
"grad_norm": 1.2826955318450928,
"learning_rate": 2.4321608040201007e-05,
"loss": 1.9959,
"step": 880
},
{
"epoch": 2.4799437016185784,
"grad_norm": 1.3462079763412476,
"learning_rate": 2.4120603015075376e-05,
"loss": 1.7451,
"step": 881
},
{
"epoch": 2.4827586206896552,
"grad_norm": 1.7991423606872559,
"learning_rate": 2.391959798994975e-05,
"loss": 2.1944,
"step": 882
},
{
"epoch": 2.4855735397607317,
"grad_norm": 1.4024704694747925,
"learning_rate": 2.3718592964824122e-05,
"loss": 1.9266,
"step": 883
},
{
"epoch": 2.4883884588318086,
"grad_norm": 1.7428147792816162,
"learning_rate": 2.351758793969849e-05,
"loss": 1.991,
"step": 884
},
{
"epoch": 2.4912033779028855,
"grad_norm": 1.4942609071731567,
"learning_rate": 2.3316582914572865e-05,
"loss": 1.7034,
"step": 885
},
{
"epoch": 2.494018296973962,
"grad_norm": 1.6050865650177002,
"learning_rate": 2.3115577889447238e-05,
"loss": 1.9718,
"step": 886
},
{
"epoch": 2.496833216045039,
"grad_norm": 1.6679102182388306,
"learning_rate": 2.291457286432161e-05,
"loss": 1.7724,
"step": 887
},
{
"epoch": 2.4996481351161153,
"grad_norm": 1.4811137914657593,
"learning_rate": 2.271356783919598e-05,
"loss": 1.9245,
"step": 888
},
{
"epoch": 2.502463054187192,
"grad_norm": 1.5758980512619019,
"learning_rate": 2.2512562814070353e-05,
"loss": 1.9094,
"step": 889
},
{
"epoch": 2.505277973258269,
"grad_norm": 1.6346875429153442,
"learning_rate": 2.2311557788944726e-05,
"loss": 1.8482,
"step": 890
},
{
"epoch": 2.5080928923293455,
"grad_norm": 1.6329705715179443,
"learning_rate": 2.21105527638191e-05,
"loss": 1.9275,
"step": 891
},
{
"epoch": 2.510907811400422,
"grad_norm": 1.9860655069351196,
"learning_rate": 2.190954773869347e-05,
"loss": 2.0708,
"step": 892
},
{
"epoch": 2.513722730471499,
"grad_norm": 2.201899528503418,
"learning_rate": 2.1708542713567838e-05,
"loss": 1.5935,
"step": 893
},
{
"epoch": 2.5165376495425757,
"grad_norm": 1.7361814975738525,
"learning_rate": 2.150753768844221e-05,
"loss": 1.716,
"step": 894
},
{
"epoch": 2.519352568613652,
"grad_norm": 1.6891804933547974,
"learning_rate": 2.1306532663316584e-05,
"loss": 2.0822,
"step": 895
},
{
"epoch": 2.522167487684729,
"grad_norm": 1.5002251863479614,
"learning_rate": 2.1105527638190957e-05,
"loss": 1.8622,
"step": 896
},
{
"epoch": 2.5249824067558055,
"grad_norm": 1.6818735599517822,
"learning_rate": 2.0904522613065327e-05,
"loss": 1.7687,
"step": 897
},
{
"epoch": 2.5277973258268824,
"grad_norm": 1.6019138097763062,
"learning_rate": 2.07035175879397e-05,
"loss": 1.8481,
"step": 898
},
{
"epoch": 2.5306122448979593,
"grad_norm": 1.517175555229187,
"learning_rate": 2.0502512562814073e-05,
"loss": 1.8791,
"step": 899
},
{
"epoch": 2.533427163969036,
"grad_norm": 1.4796918630599976,
"learning_rate": 2.0301507537688446e-05,
"loss": 1.8031,
"step": 900
},
{
"epoch": 2.5362420830401127,
"grad_norm": 1.5934321880340576,
"learning_rate": 2.0100502512562815e-05,
"loss": 1.9215,
"step": 901
},
{
"epoch": 2.539057002111189,
"grad_norm": 1.5581581592559814,
"learning_rate": 1.9899497487437188e-05,
"loss": 1.9739,
"step": 902
},
{
"epoch": 2.541871921182266,
"grad_norm": 1.6254914999008179,
"learning_rate": 1.9698492462311558e-05,
"loss": 2.0283,
"step": 903
},
{
"epoch": 2.544686840253343,
"grad_norm": 1.5393351316452026,
"learning_rate": 1.949748743718593e-05,
"loss": 2.1366,
"step": 904
},
{
"epoch": 2.5475017593244194,
"grad_norm": 1.369229793548584,
"learning_rate": 1.92964824120603e-05,
"loss": 1.8624,
"step": 905
},
{
"epoch": 2.5503166783954963,
"grad_norm": 1.5214154720306396,
"learning_rate": 1.9095477386934673e-05,
"loss": 1.8812,
"step": 906
},
{
"epoch": 2.5531315974665727,
"grad_norm": 1.7752878665924072,
"learning_rate": 1.8894472361809046e-05,
"loss": 1.959,
"step": 907
},
{
"epoch": 2.5559465165376496,
"grad_norm": 1.5053621530532837,
"learning_rate": 1.869346733668342e-05,
"loss": 1.895,
"step": 908
},
{
"epoch": 2.5587614356087265,
"grad_norm": 1.660584568977356,
"learning_rate": 1.849246231155779e-05,
"loss": 1.8461,
"step": 909
},
{
"epoch": 2.561576354679803,
"grad_norm": 1.5243057012557983,
"learning_rate": 1.829145728643216e-05,
"loss": 1.7571,
"step": 910
},
{
"epoch": 2.5643912737508794,
"grad_norm": 1.4486278295516968,
"learning_rate": 1.8090452261306535e-05,
"loss": 1.9905,
"step": 911
},
{
"epoch": 2.5672061928219563,
"grad_norm": 1.5734102725982666,
"learning_rate": 1.7889447236180908e-05,
"loss": 1.8319,
"step": 912
},
{
"epoch": 2.570021111893033,
"grad_norm": 1.524849534034729,
"learning_rate": 1.7688442211055277e-05,
"loss": 1.8446,
"step": 913
},
{
"epoch": 2.5728360309641096,
"grad_norm": 1.4271085262298584,
"learning_rate": 1.748743718592965e-05,
"loss": 1.932,
"step": 914
},
{
"epoch": 2.5756509500351865,
"grad_norm": 1.4514641761779785,
"learning_rate": 1.728643216080402e-05,
"loss": 1.8912,
"step": 915
},
{
"epoch": 2.578465869106263,
"grad_norm": 1.5679149627685547,
"learning_rate": 1.7085427135678393e-05,
"loss": 1.8389,
"step": 916
},
{
"epoch": 2.58128078817734,
"grad_norm": 1.628262996673584,
"learning_rate": 1.6884422110552762e-05,
"loss": 1.7108,
"step": 917
},
{
"epoch": 2.5840957072484168,
"grad_norm": 1.466387152671814,
"learning_rate": 1.6683417085427135e-05,
"loss": 1.7445,
"step": 918
},
{
"epoch": 2.586910626319493,
"grad_norm": 1.6148653030395508,
"learning_rate": 1.6482412060301508e-05,
"loss": 1.8271,
"step": 919
},
{
"epoch": 2.58972554539057,
"grad_norm": 1.6727656126022339,
"learning_rate": 1.628140703517588e-05,
"loss": 1.8221,
"step": 920
},
{
"epoch": 2.5925404644616465,
"grad_norm": 1.6274527311325073,
"learning_rate": 1.608040201005025e-05,
"loss": 1.9275,
"step": 921
},
{
"epoch": 2.5953553835327234,
"grad_norm": 1.5122441053390503,
"learning_rate": 1.5879396984924624e-05,
"loss": 1.7531,
"step": 922
},
{
"epoch": 2.5981703026038003,
"grad_norm": 1.5030601024627686,
"learning_rate": 1.5678391959798997e-05,
"loss": 1.9965,
"step": 923
},
{
"epoch": 2.600985221674877,
"grad_norm": 1.7044039964675903,
"learning_rate": 1.547738693467337e-05,
"loss": 1.946,
"step": 924
},
{
"epoch": 2.6038001407459537,
"grad_norm": 1.505894422531128,
"learning_rate": 1.527638190954774e-05,
"loss": 1.8394,
"step": 925
},
{
"epoch": 2.60661505981703,
"grad_norm": 1.5264232158660889,
"learning_rate": 1.507537688442211e-05,
"loss": 1.8762,
"step": 926
},
{
"epoch": 2.609429978888107,
"grad_norm": 1.512060284614563,
"learning_rate": 1.4874371859296482e-05,
"loss": 1.9039,
"step": 927
},
{
"epoch": 2.612244897959184,
"grad_norm": 1.6046111583709717,
"learning_rate": 1.4673366834170855e-05,
"loss": 1.8746,
"step": 928
},
{
"epoch": 2.6150598170302604,
"grad_norm": 1.7210888862609863,
"learning_rate": 1.4472361809045226e-05,
"loss": 1.8613,
"step": 929
},
{
"epoch": 2.6178747361013373,
"grad_norm": 1.7266684770584106,
"learning_rate": 1.4271356783919599e-05,
"loss": 1.966,
"step": 930
},
{
"epoch": 2.6206896551724137,
"grad_norm": 1.6090869903564453,
"learning_rate": 1.407035175879397e-05,
"loss": 1.8954,
"step": 931
},
{
"epoch": 2.6235045742434906,
"grad_norm": 1.6034605503082275,
"learning_rate": 1.3869346733668343e-05,
"loss": 1.9799,
"step": 932
},
{
"epoch": 2.626319493314567,
"grad_norm": 1.8251813650131226,
"learning_rate": 1.3668341708542715e-05,
"loss": 1.8237,
"step": 933
},
{
"epoch": 2.629134412385644,
"grad_norm": 1.9154014587402344,
"learning_rate": 1.3467336683417087e-05,
"loss": 1.7706,
"step": 934
},
{
"epoch": 2.6319493314567204,
"grad_norm": 1.6452045440673828,
"learning_rate": 1.3266331658291459e-05,
"loss": 2.212,
"step": 935
},
{
"epoch": 2.6347642505277973,
"grad_norm": 1.5571489334106445,
"learning_rate": 1.306532663316583e-05,
"loss": 1.7057,
"step": 936
},
{
"epoch": 2.637579169598874,
"grad_norm": 1.5120084285736084,
"learning_rate": 1.2864321608040203e-05,
"loss": 1.958,
"step": 937
},
{
"epoch": 2.6403940886699506,
"grad_norm": 1.4263646602630615,
"learning_rate": 1.2663316582914573e-05,
"loss": 2.2181,
"step": 938
},
{
"epoch": 2.6432090077410275,
"grad_norm": 1.7924742698669434,
"learning_rate": 1.2462311557788946e-05,
"loss": 2.0682,
"step": 939
},
{
"epoch": 2.646023926812104,
"grad_norm": 1.568236231803894,
"learning_rate": 1.2261306532663318e-05,
"loss": 1.9372,
"step": 940
},
{
"epoch": 2.648838845883181,
"grad_norm": 1.826130986213684,
"learning_rate": 1.2060301507537688e-05,
"loss": 1.8557,
"step": 941
},
{
"epoch": 2.6516537649542578,
"grad_norm": 1.4434233903884888,
"learning_rate": 1.1859296482412061e-05,
"loss": 1.7765,
"step": 942
},
{
"epoch": 2.654468684025334,
"grad_norm": 1.4508579969406128,
"learning_rate": 1.1658291457286432e-05,
"loss": 1.9047,
"step": 943
},
{
"epoch": 2.657283603096411,
"grad_norm": 1.6055655479431152,
"learning_rate": 1.1457286432160805e-05,
"loss": 1.9673,
"step": 944
},
{
"epoch": 2.6600985221674875,
"grad_norm": 1.7329376935958862,
"learning_rate": 1.1256281407035177e-05,
"loss": 1.9288,
"step": 945
},
{
"epoch": 2.6629134412385644,
"grad_norm": 2.1003239154815674,
"learning_rate": 1.105527638190955e-05,
"loss": 1.9618,
"step": 946
},
{
"epoch": 2.6657283603096413,
"grad_norm": 1.5641478300094604,
"learning_rate": 1.0854271356783919e-05,
"loss": 1.9631,
"step": 947
},
{
"epoch": 2.668543279380718,
"grad_norm": 1.5357648134231567,
"learning_rate": 1.0653266331658292e-05,
"loss": 2.0265,
"step": 948
},
{
"epoch": 2.6713581984517947,
"grad_norm": 1.3177186250686646,
"learning_rate": 1.0452261306532663e-05,
"loss": 1.878,
"step": 949
},
{
"epoch": 2.674173117522871,
"grad_norm": 1.499448299407959,
"learning_rate": 1.0251256281407036e-05,
"loss": 1.8745,
"step": 950
},
{
"epoch": 2.676988036593948,
"grad_norm": 1.6168919801712036,
"learning_rate": 1.0050251256281408e-05,
"loss": 2.0069,
"step": 951
},
{
"epoch": 2.6798029556650245,
"grad_norm": 1.6026453971862793,
"learning_rate": 9.849246231155779e-06,
"loss": 1.7609,
"step": 952
},
{
"epoch": 2.6826178747361014,
"grad_norm": 1.796265721321106,
"learning_rate": 9.64824120603015e-06,
"loss": 2.0887,
"step": 953
},
{
"epoch": 2.685432793807178,
"grad_norm": 1.5531530380249023,
"learning_rate": 9.447236180904523e-06,
"loss": 1.8629,
"step": 954
},
{
"epoch": 2.6882477128782547,
"grad_norm": 1.753846287727356,
"learning_rate": 9.246231155778894e-06,
"loss": 1.8199,
"step": 955
},
{
"epoch": 2.6910626319493316,
"grad_norm": 1.5960685014724731,
"learning_rate": 9.045226130653267e-06,
"loss": 2.0348,
"step": 956
},
{
"epoch": 2.693877551020408,
"grad_norm": 1.6763644218444824,
"learning_rate": 8.844221105527639e-06,
"loss": 1.8563,
"step": 957
},
{
"epoch": 2.696692470091485,
"grad_norm": 1.4198554754257202,
"learning_rate": 8.64321608040201e-06,
"loss": 1.7267,
"step": 958
},
{
"epoch": 2.6995073891625614,
"grad_norm": 1.6178436279296875,
"learning_rate": 8.442211055276381e-06,
"loss": 2.0219,
"step": 959
},
{
"epoch": 2.7023223082336383,
"grad_norm": 1.5461405515670776,
"learning_rate": 8.241206030150754e-06,
"loss": 1.8272,
"step": 960
},
{
"epoch": 2.705137227304715,
"grad_norm": 1.7036101818084717,
"learning_rate": 8.040201005025125e-06,
"loss": 1.9612,
"step": 961
},
{
"epoch": 2.7079521463757916,
"grad_norm": 1.540170669555664,
"learning_rate": 7.839195979899498e-06,
"loss": 1.8085,
"step": 962
},
{
"epoch": 2.7107670654468685,
"grad_norm": 1.4888712167739868,
"learning_rate": 7.63819095477387e-06,
"loss": 1.9548,
"step": 963
},
{
"epoch": 2.713581984517945,
"grad_norm": 1.6294909715652466,
"learning_rate": 7.437185929648241e-06,
"loss": 1.9253,
"step": 964
},
{
"epoch": 2.716396903589022,
"grad_norm": 1.50801682472229,
"learning_rate": 7.236180904522613e-06,
"loss": 2.0657,
"step": 965
},
{
"epoch": 2.7192118226600988,
"grad_norm": 1.7348463535308838,
"learning_rate": 7.035175879396985e-06,
"loss": 1.9505,
"step": 966
},
{
"epoch": 2.722026741731175,
"grad_norm": 1.63502836227417,
"learning_rate": 6.834170854271357e-06,
"loss": 1.9629,
"step": 967
},
{
"epoch": 2.724841660802252,
"grad_norm": 1.6600550413131714,
"learning_rate": 6.633165829145729e-06,
"loss": 1.8303,
"step": 968
},
{
"epoch": 2.7276565798733285,
"grad_norm": 2.031532049179077,
"learning_rate": 6.4321608040201015e-06,
"loss": 1.7822,
"step": 969
},
{
"epoch": 2.7304714989444054,
"grad_norm": 1.804951548576355,
"learning_rate": 6.231155778894473e-06,
"loss": 2.1692,
"step": 970
},
{
"epoch": 2.733286418015482,
"grad_norm": 1.4661808013916016,
"learning_rate": 6.030150753768844e-06,
"loss": 1.964,
"step": 971
},
{
"epoch": 2.736101337086559,
"grad_norm": 2.2973814010620117,
"learning_rate": 5.829145728643216e-06,
"loss": 1.7313,
"step": 972
},
{
"epoch": 2.7389162561576352,
"grad_norm": 1.6403539180755615,
"learning_rate": 5.628140703517588e-06,
"loss": 1.9302,
"step": 973
},
{
"epoch": 2.741731175228712,
"grad_norm": 1.5790972709655762,
"learning_rate": 5.4271356783919595e-06,
"loss": 1.6091,
"step": 974
},
{
"epoch": 2.744546094299789,
"grad_norm": 1.765395998954773,
"learning_rate": 5.226130653266332e-06,
"loss": 2.0181,
"step": 975
},
{
"epoch": 2.7473610133708655,
"grad_norm": 1.4669338464736938,
"learning_rate": 5.025125628140704e-06,
"loss": 1.8627,
"step": 976
},
{
"epoch": 2.7501759324419424,
"grad_norm": 1.678789734840393,
"learning_rate": 4.824120603015075e-06,
"loss": 2.0305,
"step": 977
},
{
"epoch": 2.752990851513019,
"grad_norm": 1.5381121635437012,
"learning_rate": 4.623115577889447e-06,
"loss": 1.8813,
"step": 978
},
{
"epoch": 2.7558057705840957,
"grad_norm": 1.5611159801483154,
"learning_rate": 4.422110552763819e-06,
"loss": 1.7127,
"step": 979
},
{
"epoch": 2.7586206896551726,
"grad_norm": 1.6193746328353882,
"learning_rate": 4.2211055276381906e-06,
"loss": 1.9128,
"step": 980
},
{
"epoch": 2.761435608726249,
"grad_norm": 1.4393192529678345,
"learning_rate": 4.020100502512563e-06,
"loss": 1.5317,
"step": 981
},
{
"epoch": 2.764250527797326,
"grad_norm": 1.6281440258026123,
"learning_rate": 3.819095477386935e-06,
"loss": 1.9902,
"step": 982
},
{
"epoch": 2.7670654468684024,
"grad_norm": 1.6578021049499512,
"learning_rate": 3.6180904522613065e-06,
"loss": 2.2476,
"step": 983
},
{
"epoch": 2.7698803659394793,
"grad_norm": 1.6723147630691528,
"learning_rate": 3.4170854271356786e-06,
"loss": 1.9034,
"step": 984
},
{
"epoch": 2.772695285010556,
"grad_norm": 1.6200298070907593,
"learning_rate": 3.2160804020100507e-06,
"loss": 2.129,
"step": 985
},
{
"epoch": 2.7755102040816326,
"grad_norm": 1.6610552072525024,
"learning_rate": 3.015075376884422e-06,
"loss": 1.9335,
"step": 986
},
{
"epoch": 2.7783251231527095,
"grad_norm": 1.7721863985061646,
"learning_rate": 2.814070351758794e-06,
"loss": 1.888,
"step": 987
},
{
"epoch": 2.781140042223786,
"grad_norm": 1.949180006980896,
"learning_rate": 2.613065326633166e-06,
"loss": 2.0525,
"step": 988
},
{
"epoch": 2.783954961294863,
"grad_norm": 1.5390926599502563,
"learning_rate": 2.4120603015075375e-06,
"loss": 1.981,
"step": 989
},
{
"epoch": 2.7867698803659398,
"grad_norm": 1.7333327531814575,
"learning_rate": 2.2110552763819096e-06,
"loss": 1.7485,
"step": 990
},
{
"epoch": 2.789584799437016,
"grad_norm": 1.438262939453125,
"learning_rate": 2.0100502512562813e-06,
"loss": 1.8643,
"step": 991
},
{
"epoch": 2.7923997185080927,
"grad_norm": 1.7008702754974365,
"learning_rate": 1.8090452261306533e-06,
"loss": 2.0458,
"step": 992
},
{
"epoch": 2.7952146375791695,
"grad_norm": 1.8295824527740479,
"learning_rate": 1.6080402010050254e-06,
"loss": 2.0128,
"step": 993
},
{
"epoch": 2.7980295566502464,
"grad_norm": 1.8252149820327759,
"learning_rate": 1.407035175879397e-06,
"loss": 1.724,
"step": 994
},
{
"epoch": 2.800844475721323,
"grad_norm": 1.612557291984558,
"learning_rate": 1.2060301507537688e-06,
"loss": 1.9216,
"step": 995
},
{
"epoch": 2.8036593947924,
"grad_norm": 1.486989974975586,
"learning_rate": 1.0050251256281407e-06,
"loss": 1.6633,
"step": 996
},
{
"epoch": 2.8064743138634762,
"grad_norm": 1.5488345623016357,
"learning_rate": 8.040201005025127e-07,
"loss": 1.8513,
"step": 997
},
{
"epoch": 2.809289232934553,
"grad_norm": 1.741253137588501,
"learning_rate": 6.030150753768844e-07,
"loss": 1.7444,
"step": 998
},
{
"epoch": 2.81210415200563,
"grad_norm": 1.6252341270446777,
"learning_rate": 4.0201005025125634e-07,
"loss": 1.9627,
"step": 999
},
{
"epoch": 2.8149190710767065,
"grad_norm": 1.5533764362335205,
"learning_rate": 2.0100502512562817e-07,
"loss": 1.8062,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.447791767273472e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}