{
"best_global_step": 2100,
"best_metric": 1.0858707427978516,
"best_model_checkpoint": "./outputs/checkpoint-2100",
"epoch": 0.16188870151770657,
"eval_steps": 100,
"global_step": 2100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00015417971573114913,
"grad_norm": 1.2087944746017456,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.8689,
"step": 2
},
{
"epoch": 0.00030835943146229826,
"grad_norm": 1.2666666507720947,
"learning_rate": 6e-06,
"loss": 1.7785,
"step": 4
},
{
"epoch": 0.00046253914719344736,
"grad_norm": 0.7307026982307434,
"learning_rate": 1e-05,
"loss": 1.6809,
"step": 6
},
{
"epoch": 0.0006167188629245965,
"grad_norm": 1.2569252252578735,
"learning_rate": 1.4000000000000001e-05,
"loss": 1.9048,
"step": 8
},
{
"epoch": 0.0007708985786557456,
"grad_norm": 0.9572980403900146,
"learning_rate": 1.8e-05,
"loss": 1.7574,
"step": 10
},
{
"epoch": 0.0009250782943868947,
"grad_norm": 0.9918506145477295,
"learning_rate": 2.2000000000000003e-05,
"loss": 1.858,
"step": 12
},
{
"epoch": 0.0010792580101180438,
"grad_norm": 0.9316955208778381,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.8238,
"step": 14
},
{
"epoch": 0.001233437725849193,
"grad_norm": 0.8265096545219421,
"learning_rate": 3e-05,
"loss": 1.6852,
"step": 16
},
{
"epoch": 0.001387617441580342,
"grad_norm": 0.900516152381897,
"learning_rate": 3.4000000000000007e-05,
"loss": 1.8227,
"step": 18
},
{
"epoch": 0.0015417971573114912,
"grad_norm": 0.9343056678771973,
"learning_rate": 3.8e-05,
"loss": 1.7732,
"step": 20
},
{
"epoch": 0.0016959768730426404,
"grad_norm": 0.8314495086669922,
"learning_rate": 4.2e-05,
"loss": 1.732,
"step": 22
},
{
"epoch": 0.0018501565887737894,
"grad_norm": 0.8370314240455627,
"learning_rate": 4.600000000000001e-05,
"loss": 1.6725,
"step": 24
},
{
"epoch": 0.0020043363045049384,
"grad_norm": 0.6678845286369324,
"learning_rate": 5e-05,
"loss": 1.5638,
"step": 26
},
{
"epoch": 0.0021585160202360876,
"grad_norm": 0.6469596028327942,
"learning_rate": 5.4000000000000005e-05,
"loss": 1.6414,
"step": 28
},
{
"epoch": 0.002312695735967237,
"grad_norm": 1.1161589622497559,
"learning_rate": 5.8e-05,
"loss": 1.6015,
"step": 30
},
{
"epoch": 0.002466875451698386,
"grad_norm": 0.6085391044616699,
"learning_rate": 6.2e-05,
"loss": 1.4577,
"step": 32
},
{
"epoch": 0.0026210551674295353,
"grad_norm": 0.7159522175788879,
"learning_rate": 6.6e-05,
"loss": 1.4667,
"step": 34
},
{
"epoch": 0.002775234883160684,
"grad_norm": 0.67247074842453,
"learning_rate": 7e-05,
"loss": 1.5619,
"step": 36
},
{
"epoch": 0.0029294145988918332,
"grad_norm": 0.6272625923156738,
"learning_rate": 7.4e-05,
"loss": 1.322,
"step": 38
},
{
"epoch": 0.0030835943146229824,
"grad_norm": 0.7291163206100464,
"learning_rate": 7.800000000000001e-05,
"loss": 1.3936,
"step": 40
},
{
"epoch": 0.0032377740303541317,
"grad_norm": 0.4980190396308899,
"learning_rate": 8.2e-05,
"loss": 1.3322,
"step": 42
},
{
"epoch": 0.003391953746085281,
"grad_norm": 1.032578945159912,
"learning_rate": 8.6e-05,
"loss": 1.3657,
"step": 44
},
{
"epoch": 0.0035461334618164296,
"grad_norm": 0.5118615031242371,
"learning_rate": 9e-05,
"loss": 1.2866,
"step": 46
},
{
"epoch": 0.003700313177547579,
"grad_norm": 0.5234407782554626,
"learning_rate": 9.4e-05,
"loss": 1.2806,
"step": 48
},
{
"epoch": 0.003854492893278728,
"grad_norm": 0.49764135479927063,
"learning_rate": 9.8e-05,
"loss": 1.2004,
"step": 50
},
{
"epoch": 0.004008672609009877,
"grad_norm": 0.34377485513687134,
"learning_rate": 0.00010200000000000001,
"loss": 1.1947,
"step": 52
},
{
"epoch": 0.0041628523247410265,
"grad_norm": 0.41426530480384827,
"learning_rate": 0.00010600000000000002,
"loss": 1.2689,
"step": 54
},
{
"epoch": 0.004317032040472175,
"grad_norm": 0.5027992129325867,
"learning_rate": 0.00011000000000000002,
"loss": 1.2249,
"step": 56
},
{
"epoch": 0.004471211756203325,
"grad_norm": 0.44335752725601196,
"learning_rate": 0.00011399999999999999,
"loss": 1.2771,
"step": 58
},
{
"epoch": 0.004625391471934474,
"grad_norm": 0.3176646828651428,
"learning_rate": 0.000118,
"loss": 1.1873,
"step": 60
},
{
"epoch": 0.0047795711876656224,
"grad_norm": 0.24802716076374054,
"learning_rate": 0.000122,
"loss": 1.1989,
"step": 62
},
{
"epoch": 0.004933750903396772,
"grad_norm": 0.23831751942634583,
"learning_rate": 0.000126,
"loss": 1.1093,
"step": 64
},
{
"epoch": 0.005087930619127921,
"grad_norm": 0.24024009704589844,
"learning_rate": 0.00013000000000000002,
"loss": 1.2196,
"step": 66
},
{
"epoch": 0.0052421103348590705,
"grad_norm": 0.2745237350463867,
"learning_rate": 0.000134,
"loss": 1.1802,
"step": 68
},
{
"epoch": 0.005396290050590219,
"grad_norm": 0.27817806601524353,
"learning_rate": 0.000138,
"loss": 1.1939,
"step": 70
},
{
"epoch": 0.005550469766321368,
"grad_norm": 0.19907328486442566,
"learning_rate": 0.000142,
"loss": 1.2061,
"step": 72
},
{
"epoch": 0.005704649482052518,
"grad_norm": 0.18879663944244385,
"learning_rate": 0.000146,
"loss": 1.2149,
"step": 74
},
{
"epoch": 0.0058588291977836665,
"grad_norm": 0.21456782519817352,
"learning_rate": 0.00015000000000000001,
"loss": 1.1726,
"step": 76
},
{
"epoch": 0.006013008913514816,
"grad_norm": 0.23913143575191498,
"learning_rate": 0.000154,
"loss": 1.148,
"step": 78
},
{
"epoch": 0.006167188629245965,
"grad_norm": 0.2148526906967163,
"learning_rate": 0.00015800000000000002,
"loss": 1.1925,
"step": 80
},
{
"epoch": 0.006321368344977114,
"grad_norm": 0.2392999231815338,
"learning_rate": 0.000162,
"loss": 1.1488,
"step": 82
},
{
"epoch": 0.006475548060708263,
"grad_norm": 0.16503232717514038,
"learning_rate": 0.000166,
"loss": 1.1555,
"step": 84
},
{
"epoch": 0.006629727776439412,
"grad_norm": 0.1844739466905594,
"learning_rate": 0.00017,
"loss": 1.1934,
"step": 86
},
{
"epoch": 0.006783907492170562,
"grad_norm": 0.23832857608795166,
"learning_rate": 0.000174,
"loss": 1.1129,
"step": 88
},
{
"epoch": 0.0069380872079017105,
"grad_norm": 0.8846365809440613,
"learning_rate": 0.00017800000000000002,
"loss": 1.1028,
"step": 90
},
{
"epoch": 0.007092266923632859,
"grad_norm": 0.187076598405838,
"learning_rate": 0.000182,
"loss": 1.1,
"step": 92
},
{
"epoch": 0.007246446639364009,
"grad_norm": 0.1795521378517151,
"learning_rate": 0.00018600000000000002,
"loss": 1.1478,
"step": 94
},
{
"epoch": 0.007400626355095158,
"grad_norm": 0.199871227145195,
"learning_rate": 0.00019,
"loss": 1.1223,
"step": 96
},
{
"epoch": 0.007554806070826307,
"grad_norm": 0.17832662165164948,
"learning_rate": 0.000194,
"loss": 1.0909,
"step": 98
},
{
"epoch": 0.007708985786557456,
"grad_norm": 0.17023932933807373,
"learning_rate": 0.00019800000000000002,
"loss": 1.1526,
"step": 100
},
{
"epoch": 0.007708985786557456,
"eval_loss": 1.1401352882385254,
"eval_runtime": 185.6269,
"eval_samples_per_second": 91.274,
"eval_steps_per_second": 1.428,
"step": 100
},
{
"epoch": 0.007863165502288605,
"grad_norm": 0.17429223656654358,
"learning_rate": 0.00019999484748557298,
"loss": 1.1597,
"step": 102
},
{
"epoch": 0.008017345218019754,
"grad_norm": 0.16158349812030792,
"learning_rate": 0.0001999845424567189,
"loss": 1.1297,
"step": 104
},
{
"epoch": 0.008171524933750904,
"grad_norm": 0.15818771719932556,
"learning_rate": 0.0001999742374278648,
"loss": 1.083,
"step": 106
},
{
"epoch": 0.008325704649482053,
"grad_norm": 0.1591726392507553,
"learning_rate": 0.00019996393239901073,
"loss": 1.086,
"step": 108
},
{
"epoch": 0.008479884365213202,
"grad_norm": 0.174184650182724,
"learning_rate": 0.00019995362737015664,
"loss": 1.0769,
"step": 110
},
{
"epoch": 0.00863406408094435,
"grad_norm": 0.15928815305233002,
"learning_rate": 0.00019994332234130258,
"loss": 1.1315,
"step": 112
},
{
"epoch": 0.0087882437966755,
"grad_norm": 0.19639264047145844,
"learning_rate": 0.0001999330173124485,
"loss": 1.1339,
"step": 114
},
{
"epoch": 0.00894242351240665,
"grad_norm": 0.1639835238456726,
"learning_rate": 0.0001999227122835944,
"loss": 1.0836,
"step": 116
},
{
"epoch": 0.009096603228137799,
"grad_norm": 0.18691964447498322,
"learning_rate": 0.00019991240725474033,
"loss": 1.2109,
"step": 118
},
{
"epoch": 0.009250782943868947,
"grad_norm": 0.188096821308136,
"learning_rate": 0.00019990210222588624,
"loss": 1.1778,
"step": 120
},
{
"epoch": 0.009404962659600096,
"grad_norm": 0.1527150571346283,
"learning_rate": 0.00019989179719703218,
"loss": 1.0977,
"step": 122
},
{
"epoch": 0.009559142375331245,
"grad_norm": 0.1705218255519867,
"learning_rate": 0.0001998814921681781,
"loss": 1.1333,
"step": 124
},
{
"epoch": 0.009713322091062395,
"grad_norm": 0.1888928860425949,
"learning_rate": 0.00019987118713932401,
"loss": 1.1843,
"step": 126
},
{
"epoch": 0.009867501806793544,
"grad_norm": 0.1778104603290558,
"learning_rate": 0.00019986088211046993,
"loss": 1.0766,
"step": 128
},
{
"epoch": 0.010021681522524693,
"grad_norm": 0.15807992219924927,
"learning_rate": 0.00019985057708161584,
"loss": 1.0449,
"step": 130
},
{
"epoch": 0.010175861238255842,
"grad_norm": 0.16706159710884094,
"learning_rate": 0.00019984027205276176,
"loss": 1.0644,
"step": 132
},
{
"epoch": 0.01033004095398699,
"grad_norm": 0.16455501317977905,
"learning_rate": 0.00019982996702390767,
"loss": 1.1479,
"step": 134
},
{
"epoch": 0.010484220669718141,
"grad_norm": 0.17258939146995544,
"learning_rate": 0.0001998196619950536,
"loss": 1.0614,
"step": 136
},
{
"epoch": 0.01063840038544929,
"grad_norm": 0.15501369535923004,
"learning_rate": 0.0001998093569661995,
"loss": 1.1045,
"step": 138
},
{
"epoch": 0.010792580101180439,
"grad_norm": 0.1534334272146225,
"learning_rate": 0.00019979905193734542,
"loss": 1.1035,
"step": 140
},
{
"epoch": 0.010946759816911587,
"grad_norm": 0.14120443165302277,
"learning_rate": 0.00019978874690849136,
"loss": 1.0618,
"step": 142
},
{
"epoch": 0.011100939532642736,
"grad_norm": 0.17808520793914795,
"learning_rate": 0.00019977844187963728,
"loss": 1.1687,
"step": 144
},
{
"epoch": 0.011255119248373887,
"grad_norm": 0.16697613894939423,
"learning_rate": 0.0001997681368507832,
"loss": 1.0979,
"step": 146
},
{
"epoch": 0.011409298964105035,
"grad_norm": 0.16491086781024933,
"learning_rate": 0.0001997578318219291,
"loss": 1.1219,
"step": 148
},
{
"epoch": 0.011563478679836184,
"grad_norm": 0.15342313051223755,
"learning_rate": 0.00019974752679307502,
"loss": 1.1169,
"step": 150
},
{
"epoch": 0.011717658395567333,
"grad_norm": 0.1539286971092224,
"learning_rate": 0.00019973722176422093,
"loss": 1.1288,
"step": 152
},
{
"epoch": 0.011871838111298482,
"grad_norm": 0.15605852007865906,
"learning_rate": 0.00019972691673536688,
"loss": 1.0445,
"step": 154
},
{
"epoch": 0.012026017827029632,
"grad_norm": 0.14324098825454712,
"learning_rate": 0.0001997166117065128,
"loss": 1.1309,
"step": 156
},
{
"epoch": 0.012180197542760781,
"grad_norm": 0.21045701205730438,
"learning_rate": 0.0001997063066776587,
"loss": 1.0946,
"step": 158
},
{
"epoch": 0.01233437725849193,
"grad_norm": 0.16019922494888306,
"learning_rate": 0.00019969600164880462,
"loss": 1.11,
"step": 160
},
{
"epoch": 0.012488556974223079,
"grad_norm": 0.15740078687667847,
"learning_rate": 0.00019968569661995054,
"loss": 1.112,
"step": 162
},
{
"epoch": 0.012642736689954227,
"grad_norm": 0.16974380612373352,
"learning_rate": 0.00019967539159109648,
"loss": 1.1279,
"step": 164
},
{
"epoch": 0.012796916405685378,
"grad_norm": 0.16405288875102997,
"learning_rate": 0.0001996650865622424,
"loss": 1.0952,
"step": 166
},
{
"epoch": 0.012951096121416527,
"grad_norm": 0.16120509803295135,
"learning_rate": 0.0001996547815333883,
"loss": 1.1203,
"step": 168
},
{
"epoch": 0.013105275837147675,
"grad_norm": 0.17402276396751404,
"learning_rate": 0.00019964447650453422,
"loss": 1.0991,
"step": 170
},
{
"epoch": 0.013259455552878824,
"grad_norm": 0.18349111080169678,
"learning_rate": 0.00019963417147568014,
"loss": 1.1394,
"step": 172
},
{
"epoch": 0.013413635268609973,
"grad_norm": 0.14613087475299835,
"learning_rate": 0.00019962386644682608,
"loss": 1.1357,
"step": 174
},
{
"epoch": 0.013567814984341123,
"grad_norm": 0.142988383769989,
"learning_rate": 0.000199613561417972,
"loss": 1.0169,
"step": 176
},
{
"epoch": 0.013721994700072272,
"grad_norm": 0.14817160367965698,
"learning_rate": 0.0001996032563891179,
"loss": 1.1238,
"step": 178
},
{
"epoch": 0.013876174415803421,
"grad_norm": 0.15391133725643158,
"learning_rate": 0.00019959295136026382,
"loss": 1.0712,
"step": 180
},
{
"epoch": 0.01403035413153457,
"grad_norm": 0.1766846477985382,
"learning_rate": 0.00019958264633140974,
"loss": 1.1422,
"step": 182
},
{
"epoch": 0.014184533847265719,
"grad_norm": 0.16789212822914124,
"learning_rate": 0.00019957234130255565,
"loss": 1.1266,
"step": 184
},
{
"epoch": 0.014338713562996869,
"grad_norm": 0.1527165323495865,
"learning_rate": 0.00019956203627370157,
"loss": 1.0667,
"step": 186
},
{
"epoch": 0.014492893278728018,
"grad_norm": 0.1772206574678421,
"learning_rate": 0.00019955173124484748,
"loss": 1.1182,
"step": 188
},
{
"epoch": 0.014647072994459167,
"grad_norm": 0.15008313953876495,
"learning_rate": 0.0001995414262159934,
"loss": 1.0382,
"step": 190
},
{
"epoch": 0.014801252710190315,
"grad_norm": 0.16365988552570343,
"learning_rate": 0.00019953112118713931,
"loss": 1.1262,
"step": 192
},
{
"epoch": 0.014955432425921464,
"grad_norm": 0.14952193200588226,
"learning_rate": 0.00019952081615828526,
"loss": 1.1245,
"step": 194
},
{
"epoch": 0.015109612141652615,
"grad_norm": 0.15425263345241547,
"learning_rate": 0.00019951051112943117,
"loss": 1.1452,
"step": 196
},
{
"epoch": 0.015263791857383763,
"grad_norm": 0.1567617654800415,
"learning_rate": 0.00019950020610057709,
"loss": 1.0392,
"step": 198
},
{
"epoch": 0.015417971573114912,
"grad_norm": 0.14292609691619873,
"learning_rate": 0.000199489901071723,
"loss": 1.0728,
"step": 200
},
{
"epoch": 0.015417971573114912,
"eval_loss": 1.1127630472183228,
"eval_runtime": 185.2528,
"eval_samples_per_second": 91.459,
"eval_steps_per_second": 1.43,
"step": 200
},
{
"epoch": 0.015572151288846061,
"grad_norm": 0.15465517342090607,
"learning_rate": 0.00019947959604286892,
"loss": 1.0596,
"step": 202
},
{
"epoch": 0.01572633100457721,
"grad_norm": 0.16749607026576996,
"learning_rate": 0.00019946929101401486,
"loss": 1.1005,
"step": 204
},
{
"epoch": 0.01588051072030836,
"grad_norm": 0.15854287147521973,
"learning_rate": 0.00019945898598516077,
"loss": 1.0963,
"step": 206
},
{
"epoch": 0.016034690436039507,
"grad_norm": 0.1457831859588623,
"learning_rate": 0.0001994486809563067,
"loss": 1.1149,
"step": 208
},
{
"epoch": 0.016188870151770656,
"grad_norm": 0.15744629502296448,
"learning_rate": 0.0001994383759274526,
"loss": 1.0789,
"step": 210
},
{
"epoch": 0.01634304986750181,
"grad_norm": 0.13411423563957214,
"learning_rate": 0.00019942807089859852,
"loss": 1.0641,
"step": 212
},
{
"epoch": 0.016497229583232957,
"grad_norm": 0.1575399488210678,
"learning_rate": 0.00019941776586974446,
"loss": 1.0888,
"step": 214
},
{
"epoch": 0.016651409298964106,
"grad_norm": 0.14619529247283936,
"learning_rate": 0.00019940746084089037,
"loss": 1.081,
"step": 216
},
{
"epoch": 0.016805589014695255,
"grad_norm": 0.15578237175941467,
"learning_rate": 0.0001993971558120363,
"loss": 1.1434,
"step": 218
},
{
"epoch": 0.016959768730426403,
"grad_norm": 0.1516629308462143,
"learning_rate": 0.0001993868507831822,
"loss": 1.0909,
"step": 220
},
{
"epoch": 0.017113948446157552,
"grad_norm": 0.15613436698913574,
"learning_rate": 0.00019937654575432812,
"loss": 1.0999,
"step": 222
},
{
"epoch": 0.0172681281618887,
"grad_norm": 0.14825573563575745,
"learning_rate": 0.00019936624072547406,
"loss": 1.0827,
"step": 224
},
{
"epoch": 0.01742230787761985,
"grad_norm": 0.1624906212091446,
"learning_rate": 0.00019935593569661998,
"loss": 1.0856,
"step": 226
},
{
"epoch": 0.017576487593351,
"grad_norm": 0.1380940079689026,
"learning_rate": 0.0001993456306677659,
"loss": 1.0514,
"step": 228
},
{
"epoch": 0.017730667309082147,
"grad_norm": 0.13712120056152344,
"learning_rate": 0.0001993353256389118,
"loss": 1.0977,
"step": 230
},
{
"epoch": 0.0178848470248133,
"grad_norm": 0.1448957622051239,
"learning_rate": 0.00019932502061005772,
"loss": 1.0729,
"step": 232
},
{
"epoch": 0.01803902674054445,
"grad_norm": 0.13421876728534698,
"learning_rate": 0.00019931471558120364,
"loss": 1.0879,
"step": 234
},
{
"epoch": 0.018193206456275597,
"grad_norm": 0.16884732246398926,
"learning_rate": 0.00019930441055234955,
"loss": 1.1159,
"step": 236
},
{
"epoch": 0.018347386172006746,
"grad_norm": 0.14634890854358673,
"learning_rate": 0.00019929410552349547,
"loss": 1.0568,
"step": 238
},
{
"epoch": 0.018501565887737895,
"grad_norm": 0.16796648502349854,
"learning_rate": 0.00019928380049464138,
"loss": 1.0944,
"step": 240
},
{
"epoch": 0.018655745603469043,
"grad_norm": 0.13724717497825623,
"learning_rate": 0.0001992734954657873,
"loss": 1.0609,
"step": 242
},
{
"epoch": 0.018809925319200192,
"grad_norm": 0.14133594930171967,
"learning_rate": 0.0001992631904369332,
"loss": 1.0879,
"step": 244
},
{
"epoch": 0.01896410503493134,
"grad_norm": 0.1611246019601822,
"learning_rate": 0.00019925288540807915,
"loss": 1.0681,
"step": 246
},
{
"epoch": 0.01911828475066249,
"grad_norm": 0.17420877516269684,
"learning_rate": 0.00019924258037922507,
"loss": 1.1336,
"step": 248
},
{
"epoch": 0.01927246446639364,
"grad_norm": 0.13766029477119446,
"learning_rate": 0.00019923227535037098,
"loss": 1.075,
"step": 250
},
{
"epoch": 0.01942664418212479,
"grad_norm": 0.1691662222146988,
"learning_rate": 0.0001992219703215169,
"loss": 1.1369,
"step": 252
},
{
"epoch": 0.01958082389785594,
"grad_norm": 0.14959432184696198,
"learning_rate": 0.0001992116652926628,
"loss": 1.1129,
"step": 254
},
{
"epoch": 0.01973500361358709,
"grad_norm": 0.14996406435966492,
"learning_rate": 0.00019920136026380875,
"loss": 1.0304,
"step": 256
},
{
"epoch": 0.019889183329318237,
"grad_norm": 0.13211801648139954,
"learning_rate": 0.00019919105523495467,
"loss": 1.0652,
"step": 258
},
{
"epoch": 0.020043363045049386,
"grad_norm": 0.16041967272758484,
"learning_rate": 0.00019918075020610058,
"loss": 1.077,
"step": 260
},
{
"epoch": 0.020197542760780535,
"grad_norm": 0.1524546593427658,
"learning_rate": 0.0001991704451772465,
"loss": 1.1176,
"step": 262
},
{
"epoch": 0.020351722476511683,
"grad_norm": 0.16032540798187256,
"learning_rate": 0.00019916014014839241,
"loss": 1.0736,
"step": 264
},
{
"epoch": 0.020505902192242832,
"grad_norm": 0.17891019582748413,
"learning_rate": 0.00019914983511953836,
"loss": 1.1435,
"step": 266
},
{
"epoch": 0.02066008190797398,
"grad_norm": 0.14484059810638428,
"learning_rate": 0.00019913953009068427,
"loss": 1.0356,
"step": 268
},
{
"epoch": 0.02081426162370513,
"grad_norm": 0.14321155846118927,
"learning_rate": 0.00019912922506183019,
"loss": 1.0536,
"step": 270
},
{
"epoch": 0.020968441339436282,
"grad_norm": 0.17357808351516724,
"learning_rate": 0.0001991189200329761,
"loss": 1.171,
"step": 272
},
{
"epoch": 0.02112262105516743,
"grad_norm": 0.13990800082683563,
"learning_rate": 0.00019910861500412202,
"loss": 1.0946,
"step": 274
},
{
"epoch": 0.02127680077089858,
"grad_norm": 0.16634231805801392,
"learning_rate": 0.00019909830997526796,
"loss": 1.1029,
"step": 276
},
{
"epoch": 0.02143098048662973,
"grad_norm": 0.16322381794452667,
"learning_rate": 0.00019908800494641387,
"loss": 1.0688,
"step": 278
},
{
"epoch": 0.021585160202360877,
"grad_norm": 0.1652844250202179,
"learning_rate": 0.0001990776999175598,
"loss": 1.1237,
"step": 280
},
{
"epoch": 0.021739339918092026,
"grad_norm": 0.14457885921001434,
"learning_rate": 0.0001990673948887057,
"loss": 1.1995,
"step": 282
},
{
"epoch": 0.021893519633823175,
"grad_norm": 0.15549878776073456,
"learning_rate": 0.00019905708985985162,
"loss": 1.0475,
"step": 284
},
{
"epoch": 0.022047699349554323,
"grad_norm": 0.15715502202510834,
"learning_rate": 0.00019904678483099756,
"loss": 1.1211,
"step": 286
},
{
"epoch": 0.022201879065285472,
"grad_norm": 0.14022529125213623,
"learning_rate": 0.00019903647980214347,
"loss": 1.1056,
"step": 288
},
{
"epoch": 0.02235605878101662,
"grad_norm": 0.13293786346912384,
"learning_rate": 0.0001990261747732894,
"loss": 1.0877,
"step": 290
},
{
"epoch": 0.022510238496747773,
"grad_norm": 0.14625073969364166,
"learning_rate": 0.0001990158697444353,
"loss": 1.0375,
"step": 292
},
{
"epoch": 0.022664418212478922,
"grad_norm": 0.1417943835258484,
"learning_rate": 0.0001990055647155812,
"loss": 1.091,
"step": 294
},
{
"epoch": 0.02281859792821007,
"grad_norm": 0.1519964039325714,
"learning_rate": 0.00019899525968672713,
"loss": 1.0396,
"step": 296
},
{
"epoch": 0.02297277764394122,
"grad_norm": 0.1676655411720276,
"learning_rate": 0.00019898495465787305,
"loss": 1.1249,
"step": 298
},
{
"epoch": 0.02312695735967237,
"grad_norm": 0.1487220674753189,
"learning_rate": 0.00019897464962901896,
"loss": 1.1768,
"step": 300
},
{
"epoch": 0.02312695735967237,
"eval_loss": 1.1061022281646729,
"eval_runtime": 185.239,
"eval_samples_per_second": 91.466,
"eval_steps_per_second": 1.431,
"step": 300
},
{
"epoch": 0.023281137075403517,
"grad_norm": 0.1399739533662796,
"learning_rate": 0.00019896434460016488,
"loss": 1.0962,
"step": 302
},
{
"epoch": 0.023435316791134666,
"grad_norm": 0.15282337367534637,
"learning_rate": 0.0001989540395713108,
"loss": 1.1688,
"step": 304
},
{
"epoch": 0.023589496506865815,
"grad_norm": 0.15459619462490082,
"learning_rate": 0.00019894373454245674,
"loss": 1.0216,
"step": 306
},
{
"epoch": 0.023743676222596963,
"grad_norm": 0.15799634158611298,
"learning_rate": 0.00019893342951360265,
"loss": 1.1429,
"step": 308
},
{
"epoch": 0.023897855938328112,
"grad_norm": 0.1343819946050644,
"learning_rate": 0.00019892312448474857,
"loss": 1.0959,
"step": 310
},
{
"epoch": 0.024052035654059264,
"grad_norm": 0.14791317284107208,
"learning_rate": 0.00019891281945589448,
"loss": 1.0636,
"step": 312
},
{
"epoch": 0.024206215369790413,
"grad_norm": 0.1442137360572815,
"learning_rate": 0.0001989025144270404,
"loss": 1.055,
"step": 314
},
{
"epoch": 0.024360395085521562,
"grad_norm": 0.14649145305156708,
"learning_rate": 0.00019889220939818634,
"loss": 1.0906,
"step": 316
},
{
"epoch": 0.02451457480125271,
"grad_norm": 0.14234665036201477,
"learning_rate": 0.00019888190436933225,
"loss": 1.0853,
"step": 318
},
{
"epoch": 0.02466875451698386,
"grad_norm": 0.1419668048620224,
"learning_rate": 0.00019887159934047817,
"loss": 1.0296,
"step": 320
},
{
"epoch": 0.02482293423271501,
"grad_norm": 0.14730845391750336,
"learning_rate": 0.00019886129431162408,
"loss": 1.0421,
"step": 322
},
{
"epoch": 0.024977113948446157,
"grad_norm": 0.1400081068277359,
"learning_rate": 0.00019885098928277,
"loss": 1.0291,
"step": 324
},
{
"epoch": 0.025131293664177306,
"grad_norm": 0.15542668104171753,
"learning_rate": 0.0001988406842539159,
"loss": 1.0597,
"step": 326
},
{
"epoch": 0.025285473379908455,
"grad_norm": 0.14521440863609314,
"learning_rate": 0.00019883037922506185,
"loss": 1.0491,
"step": 328
},
{
"epoch": 0.025439653095639603,
"grad_norm": 0.16224826872348785,
"learning_rate": 0.00019882007419620777,
"loss": 1.1031,
"step": 330
},
{
"epoch": 0.025593832811370756,
"grad_norm": 0.15028877556324005,
"learning_rate": 0.00019880976916735368,
"loss": 1.1154,
"step": 332
},
{
"epoch": 0.025748012527101904,
"grad_norm": 0.12962941825389862,
"learning_rate": 0.0001987994641384996,
"loss": 1.0363,
"step": 334
},
{
"epoch": 0.025902192242833053,
"grad_norm": 0.14908359944820404,
"learning_rate": 0.0001987891591096455,
"loss": 1.1513,
"step": 336
},
{
"epoch": 0.026056371958564202,
"grad_norm": 0.15441828966140747,
"learning_rate": 0.00019877885408079146,
"loss": 1.1303,
"step": 338
},
{
"epoch": 0.02621055167429535,
"grad_norm": 0.12669101357460022,
"learning_rate": 0.00019876854905193737,
"loss": 1.0875,
"step": 340
},
{
"epoch": 0.0263647313900265,
"grad_norm": 0.13190661370754242,
"learning_rate": 0.00019875824402308329,
"loss": 1.0778,
"step": 342
},
{
"epoch": 0.02651891110575765,
"grad_norm": 0.14043989777565002,
"learning_rate": 0.0001987479389942292,
"loss": 1.1011,
"step": 344
},
{
"epoch": 0.026673090821488797,
"grad_norm": 0.13694870471954346,
"learning_rate": 0.00019873763396537512,
"loss": 1.0532,
"step": 346
},
{
"epoch": 0.026827270537219946,
"grad_norm": 0.15089921653270721,
"learning_rate": 0.00019872732893652103,
"loss": 1.1292,
"step": 348
},
{
"epoch": 0.026981450252951095,
"grad_norm": 0.14839838445186615,
"learning_rate": 0.00019871702390766694,
"loss": 1.0275,
"step": 350
},
{
"epoch": 0.027135629968682247,
"grad_norm": 0.16198500990867615,
"learning_rate": 0.00019870671887881286,
"loss": 1.1453,
"step": 352
},
{
"epoch": 0.027289809684413396,
"grad_norm": 0.14694632589817047,
"learning_rate": 0.00019869641384995877,
"loss": 1.129,
"step": 354
},
{
"epoch": 0.027443989400144544,
"grad_norm": 0.16091379523277283,
"learning_rate": 0.0001986861088211047,
"loss": 1.1186,
"step": 356
},
{
"epoch": 0.027598169115875693,
"grad_norm": 0.144720658659935,
"learning_rate": 0.00019867580379225063,
"loss": 1.0224,
"step": 358
},
{
"epoch": 0.027752348831606842,
"grad_norm": 0.13851307332515717,
"learning_rate": 0.00019866549876339655,
"loss": 1.1421,
"step": 360
},
{
"epoch": 0.02790652854733799,
"grad_norm": 0.13124969601631165,
"learning_rate": 0.00019865519373454246,
"loss": 1.0938,
"step": 362
},
{
"epoch": 0.02806070826306914,
"grad_norm": 0.14723828434944153,
"learning_rate": 0.00019864488870568838,
"loss": 1.1335,
"step": 364
},
{
"epoch": 0.02821488797880029,
"grad_norm": 0.17669795453548431,
"learning_rate": 0.0001986345836768343,
"loss": 1.0765,
"step": 366
},
{
"epoch": 0.028369067694531437,
"grad_norm": 0.1457260102033615,
"learning_rate": 0.00019862427864798023,
"loss": 1.1073,
"step": 368
},
{
"epoch": 0.028523247410262586,
"grad_norm": 0.13594554364681244,
"learning_rate": 0.00019861397361912615,
"loss": 1.0587,
"step": 370
},
{
"epoch": 0.028677427125993738,
"grad_norm": 0.13798941671848297,
"learning_rate": 0.00019860366859027206,
"loss": 1.0833,
"step": 372
},
{
"epoch": 0.028831606841724887,
"grad_norm": 0.15587519109249115,
"learning_rate": 0.00019859336356141798,
"loss": 1.0287,
"step": 374
},
{
"epoch": 0.028985786557456036,
"grad_norm": 0.16585086286067963,
"learning_rate": 0.0001985830585325639,
"loss": 1.1786,
"step": 376
},
{
"epoch": 0.029139966273187184,
"grad_norm": 0.1444484293460846,
"learning_rate": 0.00019857275350370983,
"loss": 1.1793,
"step": 378
},
{
"epoch": 0.029294145988918333,
"grad_norm": 0.14413981139659882,
"learning_rate": 0.00019856244847485575,
"loss": 1.1141,
"step": 380
},
{
"epoch": 0.029448325704649482,
"grad_norm": 0.142032191157341,
"learning_rate": 0.00019855214344600166,
"loss": 1.1033,
"step": 382
},
{
"epoch": 0.02960250542038063,
"grad_norm": 0.1490195393562317,
"learning_rate": 0.00019854183841714758,
"loss": 1.1592,
"step": 384
},
{
"epoch": 0.02975668513611178,
"grad_norm": 0.1408643275499344,
"learning_rate": 0.0001985315333882935,
"loss": 1.1505,
"step": 386
},
{
"epoch": 0.02991086485184293,
"grad_norm": 0.12526237964630127,
"learning_rate": 0.00019852122835943944,
"loss": 1.1027,
"step": 388
},
{
"epoch": 0.030065044567574077,
"grad_norm": 0.1339711844921112,
"learning_rate": 0.00019851092333058535,
"loss": 1.1238,
"step": 390
},
{
"epoch": 0.03021922428330523,
"grad_norm": 0.13032345473766327,
"learning_rate": 0.00019850061830173127,
"loss": 1.1121,
"step": 392
},
{
"epoch": 0.030373403999036378,
"grad_norm": 0.15815846621990204,
"learning_rate": 0.00019849031327287718,
"loss": 1.168,
"step": 394
},
{
"epoch": 0.030527583714767527,
"grad_norm": 0.14245116710662842,
"learning_rate": 0.0001984800082440231,
"loss": 1.0436,
"step": 396
},
{
"epoch": 0.030681763430498676,
"grad_norm": 0.15660050511360168,
"learning_rate": 0.000198469703215169,
"loss": 1.158,
"step": 398
},
{
"epoch": 0.030835943146229824,
"grad_norm": 0.1654158979654312,
"learning_rate": 0.00019845939818631493,
"loss": 1.0802,
"step": 400
},
{
"epoch": 0.030835943146229824,
"eval_loss": 1.1026971340179443,
"eval_runtime": 185.7295,
"eval_samples_per_second": 91.224,
"eval_steps_per_second": 1.427,
"step": 400
},
{
"epoch": 0.030990122861960973,
"grad_norm": 0.13845407962799072,
"learning_rate": 0.00019844909315746084,
"loss": 1.1055,
"step": 402
},
{
"epoch": 0.031144302577692122,
"grad_norm": 0.14852891862392426,
"learning_rate": 0.00019843878812860676,
"loss": 1.0983,
"step": 404
},
{
"epoch": 0.031298482293423274,
"grad_norm": 0.13408593833446503,
"learning_rate": 0.00019842848309975267,
"loss": 1.1063,
"step": 406
},
{
"epoch": 0.03145266200915442,
"grad_norm": 0.14041072130203247,
"learning_rate": 0.00019841817807089859,
"loss": 1.0327,
"step": 408
},
{
"epoch": 0.03160684172488557,
"grad_norm": 0.16119754314422607,
"learning_rate": 0.00019840787304204453,
"loss": 1.1,
"step": 410
},
{
"epoch": 0.03176102144061672,
"grad_norm": 0.14471223950386047,
"learning_rate": 0.00019839756801319044,
"loss": 1.0783,
"step": 412
},
{
"epoch": 0.03191520115634787,
"grad_norm": 0.15591050684452057,
"learning_rate": 0.00019838726298433636,
"loss": 1.1782,
"step": 414
},
{
"epoch": 0.032069380872079015,
"grad_norm": 0.1766556203365326,
"learning_rate": 0.00019837695795548227,
"loss": 1.1063,
"step": 416
},
{
"epoch": 0.03222356058781017,
"grad_norm": 0.16078630089759827,
"learning_rate": 0.0001983666529266282,
"loss": 1.0891,
"step": 418
},
{
"epoch": 0.03237774030354131,
"grad_norm": 0.13378402590751648,
"learning_rate": 0.00019835634789777413,
"loss": 1.074,
"step": 420
},
{
"epoch": 0.032531920019272464,
"grad_norm": 0.14526261389255524,
"learning_rate": 0.00019834604286892004,
"loss": 1.108,
"step": 422
},
{
"epoch": 0.03268609973500362,
"grad_norm": 0.1321713775396347,
"learning_rate": 0.00019833573784006596,
"loss": 1.019,
"step": 424
},
{
"epoch": 0.03284027945073476,
"grad_norm": 0.12685374915599823,
"learning_rate": 0.00019832543281121187,
"loss": 1.09,
"step": 426
},
{
"epoch": 0.032994459166465914,
"grad_norm": 0.13825605809688568,
"learning_rate": 0.0001983151277823578,
"loss": 1.1356,
"step": 428
},
{
"epoch": 0.03314863888219706,
"grad_norm": 0.13683827221393585,
"learning_rate": 0.00019830482275350373,
"loss": 1.1405,
"step": 430
},
{
"epoch": 0.03330281859792821,
"grad_norm": 0.16707143187522888,
"learning_rate": 0.00019829451772464965,
"loss": 1.1305,
"step": 432
},
{
"epoch": 0.03345699831365936,
"grad_norm": 0.11735045164823532,
"learning_rate": 0.00019828421269579556,
"loss": 1.0421,
"step": 434
},
{
"epoch": 0.03361117802939051,
"grad_norm": 0.1337989866733551,
"learning_rate": 0.00019827390766694148,
"loss": 1.0572,
"step": 436
},
{
"epoch": 0.033765357745121655,
"grad_norm": 0.17111611366271973,
"learning_rate": 0.0001982636026380874,
"loss": 1.1698,
"step": 438
},
{
"epoch": 0.03391953746085281,
"grad_norm": 0.13785259425640106,
"learning_rate": 0.00019825329760923333,
"loss": 1.056,
"step": 440
},
{
"epoch": 0.03407371717658395,
"grad_norm": 0.15061460435390472,
"learning_rate": 0.00019824299258037925,
"loss": 1.0963,
"step": 442
},
{
"epoch": 0.034227896892315104,
"grad_norm": 0.1231001690030098,
"learning_rate": 0.00019823268755152516,
"loss": 1.1264,
"step": 444
},
{
"epoch": 0.03438207660804626,
"grad_norm": 0.13752298057079315,
"learning_rate": 0.00019822238252267108,
"loss": 1.0672,
"step": 446
},
{
"epoch": 0.0345362563237774,
"grad_norm": 0.13519813120365143,
"learning_rate": 0.000198212077493817,
"loss": 1.0882,
"step": 448
},
{
"epoch": 0.034690436039508554,
"grad_norm": 0.140150785446167,
"learning_rate": 0.0001982017724649629,
"loss": 1.0572,
"step": 450
},
{
"epoch": 0.0348446157552397,
"grad_norm": 0.13910406827926636,
"learning_rate": 0.00019819146743610882,
"loss": 1.0762,
"step": 452
},
{
"epoch": 0.03499879547097085,
"grad_norm": 0.14587442576885223,
"learning_rate": 0.00019818116240725474,
"loss": 1.1232,
"step": 454
},
{
"epoch": 0.035152975186702,
"grad_norm": 0.14476893842220306,
"learning_rate": 0.00019817085737840065,
"loss": 1.1004,
"step": 456
},
{
"epoch": 0.03530715490243315,
"grad_norm": 0.13861101865768433,
"learning_rate": 0.00019816055234954657,
"loss": 1.0302,
"step": 458
},
{
"epoch": 0.035461334618164295,
"grad_norm": 0.14342686533927917,
"learning_rate": 0.0001981502473206925,
"loss": 1.1092,
"step": 460
},
{
"epoch": 0.03561551433389545,
"grad_norm": 0.11709775030612946,
"learning_rate": 0.00019813994229183842,
"loss": 1.0463,
"step": 462
},
{
"epoch": 0.0357696940496266,
"grad_norm": 0.15154917538166046,
"learning_rate": 0.00019812963726298434,
"loss": 1.0897,
"step": 464
},
{
"epoch": 0.035923873765357744,
"grad_norm": 0.16716259717941284,
"learning_rate": 0.00019811933223413025,
"loss": 1.1214,
"step": 466
},
{
"epoch": 0.0360780534810889,
"grad_norm": 0.13513320684432983,
"learning_rate": 0.00019810902720527617,
"loss": 1.0623,
"step": 468
},
{
"epoch": 0.03623223319682004,
"grad_norm": 0.15930432081222534,
"learning_rate": 0.0001980987221764221,
"loss": 1.1092,
"step": 470
},
{
"epoch": 0.036386412912551194,
"grad_norm": 0.13990509510040283,
"learning_rate": 0.00019808841714756803,
"loss": 1.1048,
"step": 472
},
{
"epoch": 0.03654059262828234,
"grad_norm": 0.18784300982952118,
"learning_rate": 0.00019807811211871394,
"loss": 1.1676,
"step": 474
},
{
"epoch": 0.03669477234401349,
"grad_norm": 0.152045339345932,
"learning_rate": 0.00019806780708985986,
"loss": 1.1303,
"step": 476
},
{
"epoch": 0.03684895205974464,
"grad_norm": 0.1409967988729477,
"learning_rate": 0.00019805750206100577,
"loss": 1.0972,
"step": 478
},
{
"epoch": 0.03700313177547579,
"grad_norm": 0.13838854432106018,
"learning_rate": 0.0001980471970321517,
"loss": 1.101,
"step": 480
},
{
"epoch": 0.037157311491206935,
"grad_norm": 0.1579430103302002,
"learning_rate": 0.00019803689200329763,
"loss": 1.1077,
"step": 482
},
{
"epoch": 0.03731149120693809,
"grad_norm": 0.15061910450458527,
"learning_rate": 0.00019802658697444354,
"loss": 1.1239,
"step": 484
},
{
"epoch": 0.03746567092266924,
"grad_norm": 0.16408291459083557,
"learning_rate": 0.00019801628194558946,
"loss": 1.0961,
"step": 486
},
{
"epoch": 0.037619850638400384,
"grad_norm": 0.15612424910068512,
"learning_rate": 0.00019800597691673537,
"loss": 1.1299,
"step": 488
},
{
"epoch": 0.03777403035413154,
"grad_norm": 0.14135530591011047,
"learning_rate": 0.00019799567188788131,
"loss": 1.0489,
"step": 490
},
{
"epoch": 0.03792821006986268,
"grad_norm": 0.13743548095226288,
"learning_rate": 0.00019798536685902723,
"loss": 1.0837,
"step": 492
},
{
"epoch": 0.038082389785593834,
"grad_norm": 0.157401442527771,
"learning_rate": 0.00019797506183017314,
"loss": 1.0573,
"step": 494
},
{
"epoch": 0.03823656950132498,
"grad_norm": 0.14982052147388458,
"learning_rate": 0.00019796475680131906,
"loss": 1.0839,
"step": 496
},
{
"epoch": 0.03839074921705613,
"grad_norm": 0.1347000151872635,
"learning_rate": 0.00019795445177246497,
"loss": 1.113,
"step": 498
},
{
"epoch": 0.03854492893278728,
"grad_norm": 0.14478904008865356,
"learning_rate": 0.0001979441467436109,
"loss": 1.0514,
"step": 500
},
{
"epoch": 0.03854492893278728,
"eval_loss": 1.1000746488571167,
"eval_runtime": 185.5217,
"eval_samples_per_second": 91.326,
"eval_steps_per_second": 1.428,
"step": 500
},
{
"epoch": 0.03869910864851843,
"grad_norm": 0.14274291694164276,
"learning_rate": 0.00019793384171475683,
"loss": 1.0847,
"step": 502
},
{
"epoch": 0.03885328836424958,
"grad_norm": 0.14326965808868408,
"learning_rate": 0.00019792353668590275,
"loss": 1.0865,
"step": 504
},
{
"epoch": 0.03900746807998073,
"grad_norm": 0.1575518548488617,
"learning_rate": 0.00019791323165704866,
"loss": 1.1258,
"step": 506
},
{
"epoch": 0.03916164779571188,
"grad_norm": 0.14699862897396088,
"learning_rate": 0.00019790292662819458,
"loss": 1.1687,
"step": 508
},
{
"epoch": 0.039315827511443024,
"grad_norm": 0.1394687294960022,
"learning_rate": 0.0001978926215993405,
"loss": 1.1214,
"step": 510
},
{
"epoch": 0.03947000722717418,
"grad_norm": 0.14366985857486725,
"learning_rate": 0.0001978823165704864,
"loss": 1.0651,
"step": 512
},
{
"epoch": 0.03962418694290532,
"grad_norm": 0.14171218872070312,
"learning_rate": 0.00019787201154163232,
"loss": 1.1398,
"step": 514
},
{
"epoch": 0.039778366658636474,
"grad_norm": 0.13258612155914307,
"learning_rate": 0.00019786170651277824,
"loss": 1.1234,
"step": 516
},
{
"epoch": 0.03993254637436762,
"grad_norm": 0.17693160474300385,
"learning_rate": 0.00019785140148392415,
"loss": 1.1121,
"step": 518
},
{
"epoch": 0.04008672609009877,
"grad_norm": 0.143838569521904,
"learning_rate": 0.00019784109645507006,
"loss": 1.102,
"step": 520
},
{
"epoch": 0.04024090580582992,
"grad_norm": 0.14078038930892944,
"learning_rate": 0.000197830791426216,
"loss": 1.1044,
"step": 522
},
{
"epoch": 0.04039508552156107,
"grad_norm": 0.12367985397577286,
"learning_rate": 0.00019782048639736192,
"loss": 1.102,
"step": 524
},
{
"epoch": 0.04054926523729222,
"grad_norm": 0.136929452419281,
"learning_rate": 0.00019781018136850784,
"loss": 1.0802,
"step": 526
},
{
"epoch": 0.04070344495302337,
"grad_norm": 0.15831957757472992,
"learning_rate": 0.00019779987633965375,
"loss": 1.09,
"step": 528
},
{
"epoch": 0.04085762466875452,
"grad_norm": 0.15482452511787415,
"learning_rate": 0.00019778957131079967,
"loss": 1.0828,
"step": 530
},
{
"epoch": 0.041011804384485664,
"grad_norm": 0.13797122240066528,
"learning_rate": 0.0001977792662819456,
"loss": 1.1263,
"step": 532
},
{
"epoch": 0.04116598410021682,
"grad_norm": 0.18304814398288727,
"learning_rate": 0.00019776896125309152,
"loss": 1.0991,
"step": 534
},
{
"epoch": 0.04132016381594796,
"grad_norm": 0.1509987860918045,
"learning_rate": 0.00019775865622423744,
"loss": 1.0804,
"step": 536
},
{
"epoch": 0.041474343531679114,
"grad_norm": 0.13406258821487427,
"learning_rate": 0.00019774835119538335,
"loss": 1.0348,
"step": 538
},
{
"epoch": 0.04162852324741026,
"grad_norm": 0.1413736194372177,
"learning_rate": 0.00019773804616652927,
"loss": 1.066,
"step": 540
},
{
"epoch": 0.04178270296314141,
"grad_norm": 0.1451394259929657,
"learning_rate": 0.0001977277411376752,
"loss": 1.0485,
"step": 542
},
{
"epoch": 0.041936882678872564,
"grad_norm": 0.13275358080863953,
"learning_rate": 0.00019771743610882113,
"loss": 1.1164,
"step": 544
},
{
"epoch": 0.04209106239460371,
"grad_norm": 0.15869611501693726,
"learning_rate": 0.00019770713107996704,
"loss": 1.1361,
"step": 546
},
{
"epoch": 0.04224524211033486,
"grad_norm": 0.14091487228870392,
"learning_rate": 0.00019769682605111295,
"loss": 1.061,
"step": 548
},
{
"epoch": 0.04239942182606601,
"grad_norm": 0.13538867235183716,
"learning_rate": 0.00019768652102225887,
"loss": 1.0607,
"step": 550
},
{
"epoch": 0.04255360154179716,
"grad_norm": 0.15626317262649536,
"learning_rate": 0.0001976762159934048,
"loss": 1.0758,
"step": 552
},
{
"epoch": 0.042707781257528304,
"grad_norm": 0.1293731927871704,
"learning_rate": 0.00019766591096455073,
"loss": 1.0434,
"step": 554
},
{
"epoch": 0.04286196097325946,
"grad_norm": 0.13498535752296448,
"learning_rate": 0.00019765560593569664,
"loss": 1.0953,
"step": 556
},
{
"epoch": 0.0430161406889906,
"grad_norm": 0.14134527742862701,
"learning_rate": 0.00019764530090684256,
"loss": 1.1559,
"step": 558
},
{
"epoch": 0.043170320404721754,
"grad_norm": 0.13958705961704254,
"learning_rate": 0.00019763499587798847,
"loss": 1.2585,
"step": 560
},
{
"epoch": 0.0433245001204529,
"grad_norm": 0.2181047797203064,
"learning_rate": 0.0001976246908491344,
"loss": 1.0164,
"step": 562
},
{
"epoch": 0.04347867983618405,
"grad_norm": 0.1365436315536499,
"learning_rate": 0.0001976143858202803,
"loss": 1.124,
"step": 564
},
{
"epoch": 0.043632859551915204,
"grad_norm": 0.12809793651103973,
"learning_rate": 0.00019760408079142622,
"loss": 1.0378,
"step": 566
},
{
"epoch": 0.04378703926764635,
"grad_norm": 0.12341924756765366,
"learning_rate": 0.00019759377576257213,
"loss": 1.1091,
"step": 568
},
{
"epoch": 0.0439412189833775,
"grad_norm": 0.14291982352733612,
"learning_rate": 0.00019758347073371805,
"loss": 1.1366,
"step": 570
},
{
"epoch": 0.04409539869910865,
"grad_norm": 0.14486652612686157,
"learning_rate": 0.000197573165704864,
"loss": 1.0168,
"step": 572
},
{
"epoch": 0.0442495784148398,
"grad_norm": 0.1724916249513626,
"learning_rate": 0.0001975628606760099,
"loss": 1.1037,
"step": 574
},
{
"epoch": 0.044403758130570944,
"grad_norm": 0.13338427245616913,
"learning_rate": 0.00019755255564715582,
"loss": 1.0259,
"step": 576
},
{
"epoch": 0.0445579378463021,
"grad_norm": 0.1372508853673935,
"learning_rate": 0.00019754225061830173,
"loss": 1.0784,
"step": 578
},
{
"epoch": 0.04471211756203324,
"grad_norm": 0.11633725464344025,
"learning_rate": 0.00019753194558944765,
"loss": 1.0648,
"step": 580
},
{
"epoch": 0.044866297277764394,
"grad_norm": 0.14386776089668274,
"learning_rate": 0.00019752164056059356,
"loss": 1.0777,
"step": 582
},
{
"epoch": 0.045020476993495546,
"grad_norm": 0.14929193258285522,
"learning_rate": 0.0001975113355317395,
"loss": 1.1319,
"step": 584
},
{
"epoch": 0.04517465670922669,
"grad_norm": 0.1324220448732376,
"learning_rate": 0.00019750103050288542,
"loss": 1.0614,
"step": 586
},
{
"epoch": 0.045328836424957844,
"grad_norm": 0.1392926126718521,
"learning_rate": 0.00019749072547403133,
"loss": 1.142,
"step": 588
},
{
"epoch": 0.04548301614068899,
"grad_norm": 0.2632090151309967,
"learning_rate": 0.00019748042044517725,
"loss": 1.0159,
"step": 590
},
{
"epoch": 0.04563719585642014,
"grad_norm": 0.13699129223823547,
"learning_rate": 0.00019747011541632316,
"loss": 1.0778,
"step": 592
},
{
"epoch": 0.04579137557215129,
"grad_norm": 0.13768675923347473,
"learning_rate": 0.0001974598103874691,
"loss": 1.0719,
"step": 594
},
{
"epoch": 0.04594555528788244,
"grad_norm": 0.13458684086799622,
"learning_rate": 0.00019744950535861502,
"loss": 1.0145,
"step": 596
},
{
"epoch": 0.046099735003613584,
"grad_norm": 0.1772696077823639,
"learning_rate": 0.00019743920032976094,
"loss": 1.0629,
"step": 598
},
{
"epoch": 0.04625391471934474,
"grad_norm": 0.13998697698116302,
"learning_rate": 0.00019742889530090685,
"loss": 1.102,
"step": 600
},
{
"epoch": 0.04625391471934474,
"eval_loss": 1.098169207572937,
"eval_runtime": 185.5141,
"eval_samples_per_second": 91.33,
"eval_steps_per_second": 1.428,
"step": 600
},
{
"epoch": 0.04640809443507588,
"grad_norm": 0.13928066194057465,
"learning_rate": 0.00019741859027205277,
"loss": 1.1527,
"step": 602
},
{
"epoch": 0.046562274150807034,
"grad_norm": 0.13011601567268372,
"learning_rate": 0.0001974082852431987,
"loss": 1.1259,
"step": 604
},
{
"epoch": 0.046716453866538186,
"grad_norm": 0.1306074559688568,
"learning_rate": 0.00019739798021434462,
"loss": 1.0951,
"step": 606
},
{
"epoch": 0.04687063358226933,
"grad_norm": 0.14797037839889526,
"learning_rate": 0.00019738767518549054,
"loss": 1.0321,
"step": 608
},
{
"epoch": 0.047024813298000484,
"grad_norm": 0.14849938452243805,
"learning_rate": 0.00019737737015663645,
"loss": 1.1096,
"step": 610
},
{
"epoch": 0.04717899301373163,
"grad_norm": 0.12060682475566864,
"learning_rate": 0.00019736706512778237,
"loss": 1.0652,
"step": 612
},
{
"epoch": 0.04733317272946278,
"grad_norm": 0.12754854559898376,
"learning_rate": 0.00019735676009892828,
"loss": 1.1097,
"step": 614
},
{
"epoch": 0.04748735244519393,
"grad_norm": 0.12162326276302338,
"learning_rate": 0.0001973464550700742,
"loss": 1.1087,
"step": 616
},
{
"epoch": 0.04764153216092508,
"grad_norm": 0.175630122423172,
"learning_rate": 0.0001973361500412201,
"loss": 1.0723,
"step": 618
},
{
"epoch": 0.047795711876656224,
"grad_norm": 0.15365472435951233,
"learning_rate": 0.00019732584501236603,
"loss": 1.1009,
"step": 620
},
{
"epoch": 0.04794989159238738,
"grad_norm": 0.13359837234020233,
"learning_rate": 0.00019731553998351194,
"loss": 1.0974,
"step": 622
},
{
"epoch": 0.04810407130811853,
"grad_norm": 0.1482960432767868,
"learning_rate": 0.00019730523495465788,
"loss": 1.1214,
"step": 624
},
{
"epoch": 0.048258251023849674,
"grad_norm": 0.1309668868780136,
"learning_rate": 0.0001972949299258038,
"loss": 1.0849,
"step": 626
},
{
"epoch": 0.048412430739580826,
"grad_norm": 0.1544414609670639,
"learning_rate": 0.00019728462489694971,
"loss": 1.092,
"step": 628
},
{
"epoch": 0.04856661045531197,
"grad_norm": 0.14907146990299225,
"learning_rate": 0.00019727431986809563,
"loss": 1.0671,
"step": 630
},
{
"epoch": 0.048720790171043124,
"grad_norm": 0.16943813860416412,
"learning_rate": 0.00019726401483924154,
"loss": 1.1433,
"step": 632
},
{
"epoch": 0.04887496988677427,
"grad_norm": 0.14070230722427368,
"learning_rate": 0.00019725370981038749,
"loss": 1.1613,
"step": 634
},
{
"epoch": 0.04902914960250542,
"grad_norm": 0.15507204830646515,
"learning_rate": 0.0001972434047815334,
"loss": 1.1286,
"step": 636
},
{
"epoch": 0.04918332931823657,
"grad_norm": 0.13587893545627594,
"learning_rate": 0.00019723309975267932,
"loss": 1.1094,
"step": 638
},
{
"epoch": 0.04933750903396772,
"grad_norm": 0.12399852275848389,
"learning_rate": 0.00019722279472382523,
"loss": 1.058,
"step": 640
},
{
"epoch": 0.049491688749698864,
"grad_norm": 0.12497518211603165,
"learning_rate": 0.00019721248969497115,
"loss": 1.0716,
"step": 642
},
{
"epoch": 0.04964586846543002,
"grad_norm": 0.15282607078552246,
"learning_rate": 0.0001972021846661171,
"loss": 1.0912,
"step": 644
},
{
"epoch": 0.04980004818116117,
"grad_norm": 0.14203013479709625,
"learning_rate": 0.000197191879637263,
"loss": 1.0846,
"step": 646
},
{
"epoch": 0.049954227896892314,
"grad_norm": 0.12308704853057861,
"learning_rate": 0.00019718157460840892,
"loss": 1.1202,
"step": 648
},
{
"epoch": 0.050108407612623466,
"grad_norm": 0.15226681530475616,
"learning_rate": 0.00019717126957955483,
"loss": 1.0626,
"step": 650
},
{
"epoch": 0.05026258732835461,
"grad_norm": 0.12636694312095642,
"learning_rate": 0.00019716096455070075,
"loss": 1.1086,
"step": 652
},
{
"epoch": 0.050416767044085764,
"grad_norm": 0.14969666302204132,
"learning_rate": 0.0001971506595218467,
"loss": 1.1602,
"step": 654
},
{
"epoch": 0.05057094675981691,
"grad_norm": 0.130833700299263,
"learning_rate": 0.0001971403544929926,
"loss": 1.0657,
"step": 656
},
{
"epoch": 0.05072512647554806,
"grad_norm": 0.1283751279115677,
"learning_rate": 0.00019713004946413852,
"loss": 1.0371,
"step": 658
},
{
"epoch": 0.05087930619127921,
"grad_norm": 0.11827697604894638,
"learning_rate": 0.00019711974443528443,
"loss": 1.0308,
"step": 660
},
{
"epoch": 0.05103348590701036,
"grad_norm": 0.12265590578317642,
"learning_rate": 0.00019710943940643035,
"loss": 1.1127,
"step": 662
},
{
"epoch": 0.05118766562274151,
"grad_norm": 0.13979150354862213,
"learning_rate": 0.0001970991343775763,
"loss": 1.1011,
"step": 664
},
{
"epoch": 0.05134184533847266,
"grad_norm": 0.1368461698293686,
"learning_rate": 0.0001970888293487222,
"loss": 1.0857,
"step": 666
},
{
"epoch": 0.05149602505420381,
"grad_norm": 0.13669301569461823,
"learning_rate": 0.00019707852431986812,
"loss": 1.0971,
"step": 668
},
{
"epoch": 0.051650204769934954,
"grad_norm": 0.12659449875354767,
"learning_rate": 0.00019706821929101404,
"loss": 1.0556,
"step": 670
},
{
"epoch": 0.051804384485666106,
"grad_norm": 0.14103113114833832,
"learning_rate": 0.00019705791426215995,
"loss": 1.0913,
"step": 672
},
{
"epoch": 0.05195856420139725,
"grad_norm": 0.16134017705917358,
"learning_rate": 0.00019704760923330587,
"loss": 1.0994,
"step": 674
},
{
"epoch": 0.052112743917128404,
"grad_norm": 0.12725086510181427,
"learning_rate": 0.00019703730420445178,
"loss": 1.1008,
"step": 676
},
{
"epoch": 0.05226692363285955,
"grad_norm": 0.12865908443927765,
"learning_rate": 0.0001970269991755977,
"loss": 1.0186,
"step": 678
},
{
"epoch": 0.0524211033485907,
"grad_norm": 0.1661859154701233,
"learning_rate": 0.0001970166941467436,
"loss": 1.068,
"step": 680
},
{
"epoch": 0.05257528306432185,
"grad_norm": 0.14370663464069366,
"learning_rate": 0.00019700638911788953,
"loss": 1.102,
"step": 682
},
{
"epoch": 0.052729462780053,
"grad_norm": 0.13285204768180847,
"learning_rate": 0.00019699608408903544,
"loss": 1.1055,
"step": 684
},
{
"epoch": 0.05288364249578415,
"grad_norm": 0.17762747406959534,
"learning_rate": 0.00019698577906018138,
"loss": 1.1601,
"step": 686
},
{
"epoch": 0.0530378222115153,
"grad_norm": 0.12693317234516144,
"learning_rate": 0.0001969754740313273,
"loss": 1.0494,
"step": 688
},
{
"epoch": 0.05319200192724645,
"grad_norm": 0.1302707940340042,
"learning_rate": 0.0001969651690024732,
"loss": 1.066,
"step": 690
},
{
"epoch": 0.053346181642977594,
"grad_norm": 0.11844471096992493,
"learning_rate": 0.00019695486397361913,
"loss": 1.0085,
"step": 692
},
{
"epoch": 0.053500361358708746,
"grad_norm": 0.12299422174692154,
"learning_rate": 0.00019694455894476504,
"loss": 1.0985,
"step": 694
},
{
"epoch": 0.05365454107443989,
"grad_norm": 0.1222420409321785,
"learning_rate": 0.00019693425391591098,
"loss": 1.0648,
"step": 696
},
{
"epoch": 0.053808720790171044,
"grad_norm": 0.13273879885673523,
"learning_rate": 0.0001969239488870569,
"loss": 1.1108,
"step": 698
},
{
"epoch": 0.05396290050590219,
"grad_norm": 0.13202215731143951,
"learning_rate": 0.00019691364385820281,
"loss": 1.1013,
"step": 700
},
{
"epoch": 0.05396290050590219,
"eval_loss": 1.0964874029159546,
"eval_runtime": 185.3303,
"eval_samples_per_second": 91.421,
"eval_steps_per_second": 1.43,
"step": 700
},
{
"epoch": 0.05411708022163334,
"grad_norm": 0.13038010895252228,
"learning_rate": 0.00019690333882934873,
"loss": 1.0642,
"step": 702
},
{
"epoch": 0.054271259937364494,
"grad_norm": 0.18084144592285156,
"learning_rate": 0.00019689303380049464,
"loss": 1.0673,
"step": 704
},
{
"epoch": 0.05442543965309564,
"grad_norm": 0.18958036601543427,
"learning_rate": 0.00019688272877164059,
"loss": 1.0925,
"step": 706
},
{
"epoch": 0.05457961936882679,
"grad_norm": 0.13386841118335724,
"learning_rate": 0.0001968724237427865,
"loss": 1.0978,
"step": 708
},
{
"epoch": 0.05473379908455794,
"grad_norm": 0.1408504843711853,
"learning_rate": 0.00019686211871393242,
"loss": 1.1158,
"step": 710
},
{
"epoch": 0.05488797880028909,
"grad_norm": 0.12006545811891556,
"learning_rate": 0.00019685181368507833,
"loss": 1.0395,
"step": 712
},
{
"epoch": 0.055042158516020234,
"grad_norm": 0.13973191380500793,
"learning_rate": 0.00019684150865622425,
"loss": 1.0685,
"step": 714
},
{
"epoch": 0.055196338231751386,
"grad_norm": 0.14461107552051544,
"learning_rate": 0.0001968312036273702,
"loss": 1.0924,
"step": 716
},
{
"epoch": 0.05535051794748253,
"grad_norm": 0.13358595967292786,
"learning_rate": 0.0001968208985985161,
"loss": 1.0479,
"step": 718
},
{
"epoch": 0.055504697663213684,
"grad_norm": 0.13416843116283417,
"learning_rate": 0.00019681059356966202,
"loss": 1.0166,
"step": 720
},
{
"epoch": 0.05565887737894483,
"grad_norm": 0.15217959880828857,
"learning_rate": 0.00019680028854080793,
"loss": 1.0918,
"step": 722
},
{
"epoch": 0.05581305709467598,
"grad_norm": 0.13012762367725372,
"learning_rate": 0.00019678998351195385,
"loss": 1.0967,
"step": 724
},
{
"epoch": 0.055967236810407134,
"grad_norm": 0.13023535907268524,
"learning_rate": 0.00019677967848309976,
"loss": 1.0247,
"step": 726
},
{
"epoch": 0.05612141652613828,
"grad_norm": 0.13703665137290955,
"learning_rate": 0.00019676937345424568,
"loss": 1.0969,
"step": 728
},
{
"epoch": 0.05627559624186943,
"grad_norm": 0.12767066061496735,
"learning_rate": 0.0001967590684253916,
"loss": 1.08,
"step": 730
},
{
"epoch": 0.05642977595760058,
"grad_norm": 0.12238382548093796,
"learning_rate": 0.0001967487633965375,
"loss": 1.1233,
"step": 732
},
{
"epoch": 0.05658395567333173,
"grad_norm": 0.1356974095106125,
"learning_rate": 0.00019673845836768342,
"loss": 1.0439,
"step": 734
},
{
"epoch": 0.056738135389062874,
"grad_norm": 0.14199669659137726,
"learning_rate": 0.00019672815333882936,
"loss": 1.0753,
"step": 736
},
{
"epoch": 0.056892315104794026,
"grad_norm": 0.12904112040996552,
"learning_rate": 0.00019671784830997528,
"loss": 1.0749,
"step": 738
},
{
"epoch": 0.05704649482052517,
"grad_norm": 0.1235031932592392,
"learning_rate": 0.0001967075432811212,
"loss": 1.0275,
"step": 740
},
{
"epoch": 0.057200674536256324,
"grad_norm": 0.170023113489151,
"learning_rate": 0.0001966972382522671,
"loss": 1.1295,
"step": 742
},
{
"epoch": 0.057354854251987476,
"grad_norm": 0.15533532202243805,
"learning_rate": 0.00019668693322341302,
"loss": 1.0629,
"step": 744
},
{
"epoch": 0.05750903396771862,
"grad_norm": 0.1602126806974411,
"learning_rate": 0.00019667662819455897,
"loss": 1.1538,
"step": 746
},
{
"epoch": 0.057663213683449774,
"grad_norm": 0.16433580219745636,
"learning_rate": 0.00019666632316570488,
"loss": 1.1322,
"step": 748
},
{
"epoch": 0.05781739339918092,
"grad_norm": 0.13925233483314514,
"learning_rate": 0.0001966560181368508,
"loss": 1.083,
"step": 750
},
{
"epoch": 0.05797157311491207,
"grad_norm": 0.12234565615653992,
"learning_rate": 0.0001966457131079967,
"loss": 1.0113,
"step": 752
},
{
"epoch": 0.05812575283064322,
"grad_norm": 0.1425125002861023,
"learning_rate": 0.00019663540807914262,
"loss": 1.0762,
"step": 754
},
{
"epoch": 0.05827993254637437,
"grad_norm": 0.14309099316596985,
"learning_rate": 0.00019662510305028854,
"loss": 1.0633,
"step": 756
},
{
"epoch": 0.058434112262105514,
"grad_norm": 0.1381814330816269,
"learning_rate": 0.00019661479802143448,
"loss": 1.142,
"step": 758
},
{
"epoch": 0.058588291977836666,
"grad_norm": 0.15551595389842987,
"learning_rate": 0.0001966044929925804,
"loss": 1.026,
"step": 760
},
{
"epoch": 0.05874247169356781,
"grad_norm": 0.14606410264968872,
"learning_rate": 0.0001965941879637263,
"loss": 1.1265,
"step": 762
},
{
"epoch": 0.058896651409298964,
"grad_norm": 0.13017289340496063,
"learning_rate": 0.00019658388293487223,
"loss": 1.1051,
"step": 764
},
{
"epoch": 0.059050831125030116,
"grad_norm": 0.1500990092754364,
"learning_rate": 0.00019657357790601814,
"loss": 1.0948,
"step": 766
},
{
"epoch": 0.05920501084076126,
"grad_norm": 0.14307473599910736,
"learning_rate": 0.00019656327287716408,
"loss": 1.0667,
"step": 768
},
{
"epoch": 0.059359190556492414,
"grad_norm": 0.13513712584972382,
"learning_rate": 0.00019655296784831,
"loss": 1.0488,
"step": 770
},
{
"epoch": 0.05951337027222356,
"grad_norm": 0.13991938531398773,
"learning_rate": 0.0001965426628194559,
"loss": 1.0888,
"step": 772
},
{
"epoch": 0.05966754998795471,
"grad_norm": 0.15015999972820282,
"learning_rate": 0.00019653235779060183,
"loss": 1.0774,
"step": 774
},
{
"epoch": 0.05982172970368586,
"grad_norm": 0.16419099271297455,
"learning_rate": 0.00019652205276174774,
"loss": 1.0661,
"step": 776
},
{
"epoch": 0.05997590941941701,
"grad_norm": 0.12072901427745819,
"learning_rate": 0.00019651174773289366,
"loss": 1.0645,
"step": 778
},
{
"epoch": 0.060130089135148154,
"grad_norm": 0.13410696387290955,
"learning_rate": 0.00019650144270403957,
"loss": 1.0677,
"step": 780
},
{
"epoch": 0.060284268850879306,
"grad_norm": 0.13373896479606628,
"learning_rate": 0.0001964911376751855,
"loss": 1.0055,
"step": 782
},
{
"epoch": 0.06043844856661046,
"grad_norm": 0.13043928146362305,
"learning_rate": 0.0001964808326463314,
"loss": 1.0579,
"step": 784
},
{
"epoch": 0.060592628282341604,
"grad_norm": 0.13334155082702637,
"learning_rate": 0.00019647052761747732,
"loss": 1.0781,
"step": 786
},
{
"epoch": 0.060746807998072756,
"grad_norm": 0.14660002291202545,
"learning_rate": 0.00019646022258862326,
"loss": 1.1244,
"step": 788
},
{
"epoch": 0.0609009877138039,
"grad_norm": 0.1240791380405426,
"learning_rate": 0.00019644991755976917,
"loss": 1.0353,
"step": 790
},
{
"epoch": 0.061055167429535054,
"grad_norm": 0.12248943001031876,
"learning_rate": 0.0001964396125309151,
"loss": 1.1292,
"step": 792
},
{
"epoch": 0.0612093471452662,
"grad_norm": 0.1340823471546173,
"learning_rate": 0.000196429307502061,
"loss": 1.0764,
"step": 794
},
{
"epoch": 0.06136352686099735,
"grad_norm": 0.1297413557767868,
"learning_rate": 0.00019641900247320692,
"loss": 1.0998,
"step": 796
},
{
"epoch": 0.0615177065767285,
"grad_norm": 0.13512568175792694,
"learning_rate": 0.00019640869744435286,
"loss": 1.0349,
"step": 798
},
{
"epoch": 0.06167188629245965,
"grad_norm": 0.13964438438415527,
"learning_rate": 0.00019639839241549878,
"loss": 1.0543,
"step": 800
},
{
"epoch": 0.06167188629245965,
"eval_loss": 1.0952669382095337,
"eval_runtime": 185.8383,
"eval_samples_per_second": 91.171,
"eval_steps_per_second": 1.426,
"step": 800
},
{
"epoch": 0.061826066008190794,
"grad_norm": 0.1318446695804596,
"learning_rate": 0.0001963880873866447,
"loss": 1.1469,
"step": 802
},
{
"epoch": 0.061980245723921946,
"grad_norm": 0.13778544962406158,
"learning_rate": 0.0001963777823577906,
"loss": 1.0361,
"step": 804
},
{
"epoch": 0.0621344254396531,
"grad_norm": 0.14804169535636902,
"learning_rate": 0.00019636747732893652,
"loss": 1.0537,
"step": 806
},
{
"epoch": 0.062288605155384244,
"grad_norm": 0.1363479495048523,
"learning_rate": 0.00019635717230008246,
"loss": 1.0819,
"step": 808
},
{
"epoch": 0.062442784871115396,
"grad_norm": 0.12277363240718842,
"learning_rate": 0.00019634686727122838,
"loss": 1.0629,
"step": 810
},
{
"epoch": 0.06259696458684655,
"grad_norm": 0.13027344644069672,
"learning_rate": 0.0001963365622423743,
"loss": 1.0544,
"step": 812
},
{
"epoch": 0.0627511443025777,
"grad_norm": 0.1274079531431198,
"learning_rate": 0.0001963262572135202,
"loss": 1.0685,
"step": 814
},
{
"epoch": 0.06290532401830884,
"grad_norm": 0.1349189281463623,
"learning_rate": 0.00019631595218466612,
"loss": 1.0289,
"step": 816
},
{
"epoch": 0.06305950373403998,
"grad_norm": 0.1265273541212082,
"learning_rate": 0.00019630564715581206,
"loss": 1.0765,
"step": 818
},
{
"epoch": 0.06321368344977114,
"grad_norm": 0.1393941193819046,
"learning_rate": 0.00019629534212695798,
"loss": 1.0918,
"step": 820
},
{
"epoch": 0.06336786316550229,
"grad_norm": 0.12475106865167618,
"learning_rate": 0.0001962850370981039,
"loss": 1.027,
"step": 822
},
{
"epoch": 0.06352204288123343,
"grad_norm": 0.13844382762908936,
"learning_rate": 0.0001962747320692498,
"loss": 1.1482,
"step": 824
},
{
"epoch": 0.0636762225969646,
"grad_norm": 0.1444624364376068,
"learning_rate": 0.00019626442704039572,
"loss": 1.0659,
"step": 826
},
{
"epoch": 0.06383040231269574,
"grad_norm": 0.13939915597438812,
"learning_rate": 0.00019625412201154164,
"loss": 1.0392,
"step": 828
},
{
"epoch": 0.06398458202842688,
"grad_norm": 0.12919913232326508,
"learning_rate": 0.00019624381698268755,
"loss": 1.0566,
"step": 830
},
{
"epoch": 0.06413876174415803,
"grad_norm": 0.1297498196363449,
"learning_rate": 0.00019623351195383347,
"loss": 1.058,
"step": 832
},
{
"epoch": 0.06429294145988919,
"grad_norm": 0.16311457753181458,
"learning_rate": 0.00019622320692497938,
"loss": 1.1175,
"step": 834
},
{
"epoch": 0.06444712117562033,
"grad_norm": 0.14434239268302917,
"learning_rate": 0.0001962129018961253,
"loss": 1.0966,
"step": 836
},
{
"epoch": 0.06460130089135148,
"grad_norm": 0.13500697910785675,
"learning_rate": 0.00019620259686727121,
"loss": 1.138,
"step": 838
},
{
"epoch": 0.06475548060708262,
"grad_norm": 0.13175781071186066,
"learning_rate": 0.00019619229183841716,
"loss": 1.0744,
"step": 840
},
{
"epoch": 0.06490966032281378,
"grad_norm": 0.142098531126976,
"learning_rate": 0.00019618198680956307,
"loss": 1.0686,
"step": 842
},
{
"epoch": 0.06506384003854493,
"grad_norm": 0.16844119131565094,
"learning_rate": 0.00019617168178070899,
"loss": 1.0992,
"step": 844
},
{
"epoch": 0.06521801975427607,
"grad_norm": 0.13562923669815063,
"learning_rate": 0.0001961613767518549,
"loss": 1.0749,
"step": 846
},
{
"epoch": 0.06537219947000723,
"grad_norm": 0.14538466930389404,
"learning_rate": 0.00019615107172300082,
"loss": 1.123,
"step": 848
},
{
"epoch": 0.06552637918573838,
"grad_norm": 0.13058879971504211,
"learning_rate": 0.00019614076669414676,
"loss": 1.0835,
"step": 850
},
{
"epoch": 0.06568055890146952,
"grad_norm": 0.1567140519618988,
"learning_rate": 0.00019613046166529267,
"loss": 1.1157,
"step": 852
},
{
"epoch": 0.06583473861720067,
"grad_norm": 0.12576104700565338,
"learning_rate": 0.0001961201566364386,
"loss": 1.0143,
"step": 854
},
{
"epoch": 0.06598891833293183,
"grad_norm": 0.13823091983795166,
"learning_rate": 0.0001961098516075845,
"loss": 1.0797,
"step": 856
},
{
"epoch": 0.06614309804866297,
"grad_norm": 0.12293639779090881,
"learning_rate": 0.00019609954657873042,
"loss": 1.0808,
"step": 858
},
{
"epoch": 0.06629727776439412,
"grad_norm": 0.13951502740383148,
"learning_rate": 0.00019608924154987636,
"loss": 1.076,
"step": 860
},
{
"epoch": 0.06645145748012526,
"grad_norm": 0.13900773227214813,
"learning_rate": 0.00019607893652102227,
"loss": 1.0846,
"step": 862
},
{
"epoch": 0.06660563719585642,
"grad_norm": 0.14335249364376068,
"learning_rate": 0.0001960686314921682,
"loss": 1.0639,
"step": 864
},
{
"epoch": 0.06675981691158757,
"grad_norm": 0.1712643951177597,
"learning_rate": 0.0001960583264633141,
"loss": 1.1411,
"step": 866
},
{
"epoch": 0.06691399662731871,
"grad_norm": 0.12118082493543625,
"learning_rate": 0.00019604802143446002,
"loss": 1.0807,
"step": 868
},
{
"epoch": 0.06706817634304987,
"grad_norm": 0.141808420419693,
"learning_rate": 0.00019603771640560596,
"loss": 1.0641,
"step": 870
},
{
"epoch": 0.06722235605878102,
"grad_norm": 0.14798308908939362,
"learning_rate": 0.00019602741137675188,
"loss": 1.073,
"step": 872
},
{
"epoch": 0.06737653577451216,
"grad_norm": 0.13768306374549866,
"learning_rate": 0.0001960171063478978,
"loss": 1.0735,
"step": 874
},
{
"epoch": 0.06753071549024331,
"grad_norm": 0.12452355027198792,
"learning_rate": 0.0001960068013190437,
"loss": 1.0509,
"step": 876
},
{
"epoch": 0.06768489520597447,
"grad_norm": 0.1402217000722885,
"learning_rate": 0.00019599649629018962,
"loss": 1.1157,
"step": 878
},
{
"epoch": 0.06783907492170561,
"grad_norm": 0.12509870529174805,
"learning_rate": 0.00019598619126133556,
"loss": 1.0516,
"step": 880
},
{
"epoch": 0.06799325463743676,
"grad_norm": 0.1574297547340393,
"learning_rate": 0.00019597588623248148,
"loss": 1.0823,
"step": 882
},
{
"epoch": 0.0681474343531679,
"grad_norm": 0.14185413718223572,
"learning_rate": 0.0001959655812036274,
"loss": 1.0444,
"step": 884
},
{
"epoch": 0.06830161406889906,
"grad_norm": 0.1380462348461151,
"learning_rate": 0.0001959552761747733,
"loss": 1.1066,
"step": 886
},
{
"epoch": 0.06845579378463021,
"grad_norm": 0.12986746430397034,
"learning_rate": 0.00019594497114591922,
"loss": 1.1006,
"step": 888
},
{
"epoch": 0.06860997350036135,
"grad_norm": 0.13894346356391907,
"learning_rate": 0.00019593466611706514,
"loss": 1.0569,
"step": 890
},
{
"epoch": 0.06876415321609251,
"grad_norm": 0.12822435796260834,
"learning_rate": 0.00019592436108821105,
"loss": 1.0696,
"step": 892
},
{
"epoch": 0.06891833293182366,
"grad_norm": 0.1369408816099167,
"learning_rate": 0.00019591405605935697,
"loss": 1.0691,
"step": 894
},
{
"epoch": 0.0690725126475548,
"grad_norm": 0.13459660112857819,
"learning_rate": 0.00019590375103050288,
"loss": 1.0801,
"step": 896
},
{
"epoch": 0.06922669236328595,
"grad_norm": 0.1299123764038086,
"learning_rate": 0.0001958934460016488,
"loss": 1.0885,
"step": 898
},
{
"epoch": 0.06938087207901711,
"grad_norm": 0.12562230229377747,
"learning_rate": 0.00019588314097279474,
"loss": 1.183,
"step": 900
},
{
"epoch": 0.06938087207901711,
"eval_loss": 1.0944268703460693,
"eval_runtime": 185.3723,
"eval_samples_per_second": 91.4,
"eval_steps_per_second": 1.43,
"step": 900
},
{
"epoch": 0.06953505179474825,
"grad_norm": 0.13996927440166473,
"learning_rate": 0.00019587283594394065,
"loss": 1.0356,
"step": 902
},
{
"epoch": 0.0696892315104794,
"grad_norm": 0.128004252910614,
"learning_rate": 0.00019586253091508657,
"loss": 1.0343,
"step": 904
},
{
"epoch": 0.06984341122621056,
"grad_norm": 0.15650418400764465,
"learning_rate": 0.00019585222588623248,
"loss": 1.1138,
"step": 906
},
{
"epoch": 0.0699975909419417,
"grad_norm": 0.5840476751327515,
"learning_rate": 0.0001958419208573784,
"loss": 1.1785,
"step": 908
},
{
"epoch": 0.07015177065767285,
"grad_norm": 0.15330374240875244,
"learning_rate": 0.00019583161582852434,
"loss": 1.0243,
"step": 910
},
{
"epoch": 0.070305950373404,
"grad_norm": 0.1603543907403946,
"learning_rate": 0.00019582131079967026,
"loss": 1.1228,
"step": 912
},
{
"epoch": 0.07046013008913515,
"grad_norm": 0.14209845662117004,
"learning_rate": 0.00019581100577081617,
"loss": 1.0939,
"step": 914
},
{
"epoch": 0.0706143098048663,
"grad_norm": 0.16117019951343536,
"learning_rate": 0.00019580070074196209,
"loss": 1.1447,
"step": 916
},
{
"epoch": 0.07076848952059744,
"grad_norm": 0.14068694412708282,
"learning_rate": 0.000195790395713108,
"loss": 1.0642,
"step": 918
},
{
"epoch": 0.07092266923632859,
"grad_norm": 0.15248316526412964,
"learning_rate": 0.00019578009068425394,
"loss": 1.0162,
"step": 920
},
{
"epoch": 0.07107684895205975,
"grad_norm": 0.22734233736991882,
"learning_rate": 0.00019576978565539986,
"loss": 1.1123,
"step": 922
},
{
"epoch": 0.0712310286677909,
"grad_norm": 0.1393287032842636,
"learning_rate": 0.00019575948062654577,
"loss": 1.0862,
"step": 924
},
{
"epoch": 0.07138520838352204,
"grad_norm": 0.12911191582679749,
"learning_rate": 0.0001957491755976917,
"loss": 1.0651,
"step": 926
},
{
"epoch": 0.0715393880992532,
"grad_norm": 0.12298440933227539,
"learning_rate": 0.0001957388705688376,
"loss": 1.1227,
"step": 928
},
{
"epoch": 0.07169356781498434,
"grad_norm": 0.14941005408763885,
"learning_rate": 0.00019572856553998352,
"loss": 1.0989,
"step": 930
},
{
"epoch": 0.07184774753071549,
"grad_norm": 0.1411515325307846,
"learning_rate": 0.00019571826051112946,
"loss": 1.0816,
"step": 932
},
{
"epoch": 0.07200192724644663,
"grad_norm": 0.11999720335006714,
"learning_rate": 0.00019570795548227537,
"loss": 1.0306,
"step": 934
},
{
"epoch": 0.0721561069621778,
"grad_norm": 0.1500861495733261,
"learning_rate": 0.0001956976504534213,
"loss": 1.0678,
"step": 936
},
{
"epoch": 0.07231028667790894,
"grad_norm": 0.12102475017309189,
"learning_rate": 0.0001956873454245672,
"loss": 1.0534,
"step": 938
},
{
"epoch": 0.07246446639364008,
"grad_norm": 0.11554603278636932,
"learning_rate": 0.00019567704039571312,
"loss": 1.0535,
"step": 940
},
{
"epoch": 0.07261864610937123,
"grad_norm": 0.12290264666080475,
"learning_rate": 0.00019566673536685903,
"loss": 1.0738,
"step": 942
},
{
"epoch": 0.07277282582510239,
"grad_norm": 0.17740991711616516,
"learning_rate": 0.00019565643033800495,
"loss": 1.0811,
"step": 944
},
{
"epoch": 0.07292700554083353,
"grad_norm": 0.14767777919769287,
"learning_rate": 0.00019564612530915086,
"loss": 1.105,
"step": 946
},
{
"epoch": 0.07308118525656468,
"grad_norm": 0.13773177564144135,
"learning_rate": 0.00019563582028029678,
"loss": 1.0983,
"step": 948
},
{
"epoch": 0.07323536497229584,
"grad_norm": 0.13891370594501495,
"learning_rate": 0.0001956255152514427,
"loss": 1.1349,
"step": 950
},
{
"epoch": 0.07338954468802698,
"grad_norm": 0.14717017114162445,
"learning_rate": 0.00019561521022258863,
"loss": 1.134,
"step": 952
},
{
"epoch": 0.07354372440375813,
"grad_norm": 0.15095743536949158,
"learning_rate": 0.00019560490519373455,
"loss": 1.063,
"step": 954
},
{
"epoch": 0.07369790411948927,
"grad_norm": 0.12851206958293915,
"learning_rate": 0.00019559460016488046,
"loss": 1.1005,
"step": 956
},
{
"epoch": 0.07385208383522043,
"grad_norm": 0.13364006578922272,
"learning_rate": 0.00019558429513602638,
"loss": 1.0429,
"step": 958
},
{
"epoch": 0.07400626355095158,
"grad_norm": 0.1326039433479309,
"learning_rate": 0.0001955739901071723,
"loss": 1.1586,
"step": 960
},
{
"epoch": 0.07416044326668272,
"grad_norm": 0.13149486482143402,
"learning_rate": 0.00019556368507831824,
"loss": 1.109,
"step": 962
},
{
"epoch": 0.07431462298241387,
"grad_norm": 0.1189669519662857,
"learning_rate": 0.00019555338004946415,
"loss": 1.0462,
"step": 964
},
{
"epoch": 0.07446880269814503,
"grad_norm": 0.14341482520103455,
"learning_rate": 0.00019554307502061007,
"loss": 1.0623,
"step": 966
},
{
"epoch": 0.07462298241387617,
"grad_norm": 0.14133721590042114,
"learning_rate": 0.00019553276999175598,
"loss": 1.0945,
"step": 968
},
{
"epoch": 0.07477716212960732,
"grad_norm": 0.1351941078901291,
"learning_rate": 0.0001955224649629019,
"loss": 1.0327,
"step": 970
},
{
"epoch": 0.07493134184533848,
"grad_norm": 0.12836019694805145,
"learning_rate": 0.00019551215993404784,
"loss": 1.069,
"step": 972
},
{
"epoch": 0.07508552156106962,
"grad_norm": 0.13199055194854736,
"learning_rate": 0.00019550185490519375,
"loss": 1.0323,
"step": 974
},
{
"epoch": 0.07523970127680077,
"grad_norm": 0.14991353452205658,
"learning_rate": 0.00019549154987633967,
"loss": 1.0625,
"step": 976
},
{
"epoch": 0.07539388099253191,
"grad_norm": 0.13832435011863708,
"learning_rate": 0.00019548124484748558,
"loss": 1.1031,
"step": 978
},
{
"epoch": 0.07554806070826307,
"grad_norm": 0.12351599335670471,
"learning_rate": 0.0001954709398186315,
"loss": 1.0286,
"step": 980
},
{
"epoch": 0.07570224042399422,
"grad_norm": 0.12360050529241562,
"learning_rate": 0.00019546063478977744,
"loss": 1.0652,
"step": 982
},
{
"epoch": 0.07585642013972536,
"grad_norm": 0.13384872674942017,
"learning_rate": 0.00019545032976092335,
"loss": 1.1125,
"step": 984
},
{
"epoch": 0.07601059985545652,
"grad_norm": 0.13200527429580688,
"learning_rate": 0.00019544002473206927,
"loss": 1.0727,
"step": 986
},
{
"epoch": 0.07616477957118767,
"grad_norm": 0.143647700548172,
"learning_rate": 0.00019542971970321518,
"loss": 1.1207,
"step": 988
},
{
"epoch": 0.07631895928691881,
"grad_norm": 0.13605177402496338,
"learning_rate": 0.0001954194146743611,
"loss": 1.0225,
"step": 990
},
{
"epoch": 0.07647313900264996,
"grad_norm": 0.12646125257015228,
"learning_rate": 0.00019540910964550701,
"loss": 1.11,
"step": 992
},
{
"epoch": 0.07662731871838112,
"grad_norm": 0.132467120885849,
"learning_rate": 0.00019539880461665293,
"loss": 1.1092,
"step": 994
},
{
"epoch": 0.07678149843411226,
"grad_norm": 0.12461701035499573,
"learning_rate": 0.00019538849958779884,
"loss": 1.0854,
"step": 996
},
{
"epoch": 0.07693567814984341,
"grad_norm": 0.13430501520633698,
"learning_rate": 0.00019537819455894476,
"loss": 1.2,
"step": 998
},
{
"epoch": 0.07708985786557455,
"grad_norm": 0.12623916566371918,
"learning_rate": 0.00019536788953009067,
"loss": 1.0522,
"step": 1000
},
{
"epoch": 0.07708985786557455,
"eval_loss": 1.0930616855621338,
"eval_runtime": 185.4001,
"eval_samples_per_second": 91.386,
"eval_steps_per_second": 1.429,
"step": 1000
},
{
"epoch": 0.07724403758130571,
"grad_norm": 0.11760087311267853,
"learning_rate": 0.00019535758450123662,
"loss": 1.1566,
"step": 1002
},
{
"epoch": 0.07739821729703686,
"grad_norm": 0.145633727312088,
"learning_rate": 0.00019534727947238253,
"loss": 1.094,
"step": 1004
},
{
"epoch": 0.077552397012768,
"grad_norm": 0.1311633288860321,
"learning_rate": 0.00019533697444352845,
"loss": 1.0792,
"step": 1006
},
{
"epoch": 0.07770657672849916,
"grad_norm": 0.12563548982143402,
"learning_rate": 0.00019532666941467436,
"loss": 1.0601,
"step": 1008
},
{
"epoch": 0.07786075644423031,
"grad_norm": 0.14429886639118195,
"learning_rate": 0.00019531636438582028,
"loss": 1.0926,
"step": 1010
},
{
"epoch": 0.07801493615996145,
"grad_norm": 0.13131891191005707,
"learning_rate": 0.0001953060593569662,
"loss": 1.1012,
"step": 1012
},
{
"epoch": 0.0781691158756926,
"grad_norm": 0.14185300469398499,
"learning_rate": 0.00019529575432811213,
"loss": 1.1113,
"step": 1014
},
{
"epoch": 0.07832329559142376,
"grad_norm": 0.14298418164253235,
"learning_rate": 0.00019528544929925805,
"loss": 1.0909,
"step": 1016
},
{
"epoch": 0.0784774753071549,
"grad_norm": 0.1339821219444275,
"learning_rate": 0.00019527514427040396,
"loss": 1.0994,
"step": 1018
},
{
"epoch": 0.07863165502288605,
"grad_norm": 0.1252928525209427,
"learning_rate": 0.00019526483924154988,
"loss": 1.0316,
"step": 1020
},
{
"epoch": 0.0787858347386172,
"grad_norm": 0.1277703046798706,
"learning_rate": 0.0001952545342126958,
"loss": 1.1067,
"step": 1022
},
{
"epoch": 0.07894001445434835,
"grad_norm": 0.12644124031066895,
"learning_rate": 0.00019524422918384173,
"loss": 1.0176,
"step": 1024
},
{
"epoch": 0.0790941941700795,
"grad_norm": 0.13443627953529358,
"learning_rate": 0.00019523392415498765,
"loss": 1.0754,
"step": 1026
},
{
"epoch": 0.07924837388581064,
"grad_norm": 0.1895609050989151,
"learning_rate": 0.00019522361912613356,
"loss": 1.0551,
"step": 1028
},
{
"epoch": 0.0794025536015418,
"grad_norm": 0.1372397392988205,
"learning_rate": 0.00019521331409727948,
"loss": 1.0442,
"step": 1030
},
{
"epoch": 0.07955673331727295,
"grad_norm": 0.14173942804336548,
"learning_rate": 0.0001952030090684254,
"loss": 1.0692,
"step": 1032
},
{
"epoch": 0.0797109130330041,
"grad_norm": 0.12321804463863373,
"learning_rate": 0.00019519270403957134,
"loss": 1.0276,
"step": 1034
},
{
"epoch": 0.07986509274873524,
"grad_norm": 0.12327130138874054,
"learning_rate": 0.00019518239901071725,
"loss": 1.0376,
"step": 1036
},
{
"epoch": 0.0800192724644664,
"grad_norm": 0.12301841378211975,
"learning_rate": 0.00019517209398186317,
"loss": 1.0887,
"step": 1038
},
{
"epoch": 0.08017345218019754,
"grad_norm": 0.1429559886455536,
"learning_rate": 0.00019516178895300908,
"loss": 1.0321,
"step": 1040
},
{
"epoch": 0.08032763189592869,
"grad_norm": 0.13955366611480713,
"learning_rate": 0.000195151483924155,
"loss": 1.1081,
"step": 1042
},
{
"epoch": 0.08048181161165983,
"grad_norm": 0.13553303480148315,
"learning_rate": 0.00019514117889530094,
"loss": 1.0252,
"step": 1044
},
{
"epoch": 0.080635991327391,
"grad_norm": 0.14100225269794464,
"learning_rate": 0.00019513087386644685,
"loss": 1.1071,
"step": 1046
},
{
"epoch": 0.08079017104312214,
"grad_norm": 0.14522643387317657,
"learning_rate": 0.00019512056883759277,
"loss": 1.0653,
"step": 1048
},
{
"epoch": 0.08094435075885328,
"grad_norm": 0.14540371298789978,
"learning_rate": 0.00019511026380873868,
"loss": 1.01,
"step": 1050
},
{
"epoch": 0.08109853047458444,
"grad_norm": 0.1459018737077713,
"learning_rate": 0.0001950999587798846,
"loss": 1.1147,
"step": 1052
},
{
"epoch": 0.08125271019031559,
"grad_norm": 0.12590867280960083,
"learning_rate": 0.0001950896537510305,
"loss": 1.0685,
"step": 1054
},
{
"epoch": 0.08140688990604673,
"grad_norm": 0.11943504959344864,
"learning_rate": 0.00019507934872217643,
"loss": 1.0854,
"step": 1056
},
{
"epoch": 0.08156106962177788,
"grad_norm": 0.12039398401975632,
"learning_rate": 0.00019506904369332234,
"loss": 1.1397,
"step": 1058
},
{
"epoch": 0.08171524933750904,
"grad_norm": 0.1411554217338562,
"learning_rate": 0.00019505873866446826,
"loss": 1.1271,
"step": 1060
},
{
"epoch": 0.08186942905324018,
"grad_norm": 0.1402871012687683,
"learning_rate": 0.00019504843363561417,
"loss": 1.0425,
"step": 1062
},
{
"epoch": 0.08202360876897133,
"grad_norm": 0.13545840978622437,
"learning_rate": 0.00019503812860676011,
"loss": 1.0571,
"step": 1064
},
{
"epoch": 0.08217778848470249,
"grad_norm": 0.12789209187030792,
"learning_rate": 0.00019502782357790603,
"loss": 1.0596,
"step": 1066
},
{
"epoch": 0.08233196820043363,
"grad_norm": 0.13018928468227386,
"learning_rate": 0.00019501751854905194,
"loss": 1.1188,
"step": 1068
},
{
"epoch": 0.08248614791616478,
"grad_norm": 0.12482234835624695,
"learning_rate": 0.00019500721352019786,
"loss": 1.0831,
"step": 1070
},
{
"epoch": 0.08264032763189592,
"grad_norm": 0.11897309869527817,
"learning_rate": 0.00019499690849134377,
"loss": 1.0658,
"step": 1072
},
{
"epoch": 0.08279450734762708,
"grad_norm": 0.12954497337341309,
"learning_rate": 0.00019498660346248972,
"loss": 1.0204,
"step": 1074
},
{
"epoch": 0.08294868706335823,
"grad_norm": 0.14220042526721954,
"learning_rate": 0.00019497629843363563,
"loss": 1.1101,
"step": 1076
},
{
"epoch": 0.08310286677908937,
"grad_norm": 0.1631559580564499,
"learning_rate": 0.00019496599340478155,
"loss": 1.1352,
"step": 1078
},
{
"epoch": 0.08325704649482052,
"grad_norm": 0.13439539074897766,
"learning_rate": 0.00019495568837592746,
"loss": 1.0108,
"step": 1080
},
{
"epoch": 0.08341122621055168,
"grad_norm": 0.12389718741178513,
"learning_rate": 0.00019494538334707338,
"loss": 1.0155,
"step": 1082
},
{
"epoch": 0.08356540592628282,
"grad_norm": 0.1241556853055954,
"learning_rate": 0.00019493507831821932,
"loss": 1.1428,
"step": 1084
},
{
"epoch": 0.08371958564201397,
"grad_norm": 0.13087880611419678,
"learning_rate": 0.00019492477328936523,
"loss": 1.0876,
"step": 1086
},
{
"epoch": 0.08387376535774513,
"grad_norm": 0.12431449443101883,
"learning_rate": 0.00019491446826051115,
"loss": 1.0758,
"step": 1088
},
{
"epoch": 0.08402794507347627,
"grad_norm": 0.13807635009288788,
"learning_rate": 0.00019490416323165706,
"loss": 1.0902,
"step": 1090
},
{
"epoch": 0.08418212478920742,
"grad_norm": 0.12751048803329468,
"learning_rate": 0.00019489385820280298,
"loss": 1.0732,
"step": 1092
},
{
"epoch": 0.08433630450493856,
"grad_norm": 0.15594707429409027,
"learning_rate": 0.00019488355317394892,
"loss": 1.1115,
"step": 1094
},
{
"epoch": 0.08449048422066972,
"grad_norm": 0.11647301912307739,
"learning_rate": 0.00019487324814509483,
"loss": 1.1592,
"step": 1096
},
{
"epoch": 0.08464466393640087,
"grad_norm": 0.13609850406646729,
"learning_rate": 0.00019486294311624075,
"loss": 1.1139,
"step": 1098
},
{
"epoch": 0.08479884365213201,
"grad_norm": 0.1234198659658432,
"learning_rate": 0.00019485263808738666,
"loss": 1.0682,
"step": 1100
},
{
"epoch": 0.08479884365213201,
"eval_loss": 1.0920624732971191,
"eval_runtime": 185.5142,
"eval_samples_per_second": 91.33,
"eval_steps_per_second": 1.428,
"step": 1100
},
{
"epoch": 0.08495302336786316,
"grad_norm": 0.1375039666891098,
"learning_rate": 0.00019484233305853258,
"loss": 1.0585,
"step": 1102
},
{
"epoch": 0.08510720308359432,
"grad_norm": 0.14471521973609924,
"learning_rate": 0.0001948320280296785,
"loss": 1.1115,
"step": 1104
},
{
"epoch": 0.08526138279932546,
"grad_norm": 0.12425632029771805,
"learning_rate": 0.0001948217230008244,
"loss": 1.0501,
"step": 1106
},
{
"epoch": 0.08541556251505661,
"grad_norm": 0.1161596029996872,
"learning_rate": 0.00019481141797197032,
"loss": 1.0182,
"step": 1108
},
{
"epoch": 0.08556974223078777,
"grad_norm": 0.11700072139501572,
"learning_rate": 0.00019480111294311624,
"loss": 1.0579,
"step": 1110
},
{
"epoch": 0.08572392194651891,
"grad_norm": 0.14330415427684784,
"learning_rate": 0.00019479080791426215,
"loss": 1.1211,
"step": 1112
},
{
"epoch": 0.08587810166225006,
"grad_norm": 0.14039026200771332,
"learning_rate": 0.00019478050288540807,
"loss": 1.0826,
"step": 1114
},
{
"epoch": 0.0860322813779812,
"grad_norm": 0.14031362533569336,
"learning_rate": 0.000194770197856554,
"loss": 1.0871,
"step": 1116
},
{
"epoch": 0.08618646109371236,
"grad_norm": 0.12351037561893463,
"learning_rate": 0.00019475989282769993,
"loss": 1.001,
"step": 1118
},
{
"epoch": 0.08634064080944351,
"grad_norm": 0.11667052656412125,
"learning_rate": 0.00019474958779884584,
"loss": 1.0421,
"step": 1120
},
{
"epoch": 0.08649482052517465,
"grad_norm": 0.1489124447107315,
"learning_rate": 0.00019473928276999175,
"loss": 1.1644,
"step": 1122
},
{
"epoch": 0.0866490002409058,
"grad_norm": 0.1338202804327011,
"learning_rate": 0.00019472897774113767,
"loss": 1.1239,
"step": 1124
},
{
"epoch": 0.08680317995663696,
"grad_norm": 0.13266493380069733,
"learning_rate": 0.0001947186727122836,
"loss": 1.0839,
"step": 1126
},
{
"epoch": 0.0869573596723681,
"grad_norm": 0.13726286590099335,
"learning_rate": 0.00019470836768342953,
"loss": 1.1325,
"step": 1128
},
{
"epoch": 0.08711153938809925,
"grad_norm": 0.14077100157737732,
"learning_rate": 0.00019469806265457544,
"loss": 1.0429,
"step": 1130
},
{
"epoch": 0.08726571910383041,
"grad_norm": 0.1362866312265396,
"learning_rate": 0.00019468775762572136,
"loss": 1.0715,
"step": 1132
},
{
"epoch": 0.08741989881956155,
"grad_norm": 0.12472223490476608,
"learning_rate": 0.00019467745259686727,
"loss": 1.0503,
"step": 1134
},
{
"epoch": 0.0875740785352927,
"grad_norm": 0.1350635141134262,
"learning_rate": 0.0001946671475680132,
"loss": 1.0498,
"step": 1136
},
{
"epoch": 0.08772825825102384,
"grad_norm": 0.1424301117658615,
"learning_rate": 0.00019465684253915913,
"loss": 1.1589,
"step": 1138
},
{
"epoch": 0.087882437966755,
"grad_norm": 0.12365067005157471,
"learning_rate": 0.00019464653751030504,
"loss": 1.1065,
"step": 1140
},
{
"epoch": 0.08803661768248615,
"grad_norm": 0.16497495770454407,
"learning_rate": 0.00019463623248145096,
"loss": 1.0189,
"step": 1142
},
{
"epoch": 0.0881907973982173,
"grad_norm": 0.1381298303604126,
"learning_rate": 0.00019462592745259687,
"loss": 1.0426,
"step": 1144
},
{
"epoch": 0.08834497711394845,
"grad_norm": 0.15007291734218597,
"learning_rate": 0.00019461562242374282,
"loss": 1.1108,
"step": 1146
},
{
"epoch": 0.0884991568296796,
"grad_norm": 0.19384606182575226,
"learning_rate": 0.00019460531739488873,
"loss": 1.0664,
"step": 1148
},
{
"epoch": 0.08865333654541074,
"grad_norm": 0.12032177299261093,
"learning_rate": 0.00019459501236603465,
"loss": 1.018,
"step": 1150
},
{
"epoch": 0.08880751626114189,
"grad_norm": 0.1197669506072998,
"learning_rate": 0.00019458470733718056,
"loss": 1.071,
"step": 1152
},
{
"epoch": 0.08896169597687305,
"grad_norm": 0.12108784914016724,
"learning_rate": 0.00019457440230832647,
"loss": 1.0499,
"step": 1154
},
{
"epoch": 0.0891158756926042,
"grad_norm": 0.1270270049571991,
"learning_rate": 0.0001945640972794724,
"loss": 1.1172,
"step": 1156
},
{
"epoch": 0.08927005540833534,
"grad_norm": 0.13599786162376404,
"learning_rate": 0.0001945537922506183,
"loss": 1.103,
"step": 1158
},
{
"epoch": 0.08942423512406648,
"grad_norm": 0.12051045894622803,
"learning_rate": 0.00019454348722176422,
"loss": 1.0905,
"step": 1160
},
{
"epoch": 0.08957841483979764,
"grad_norm": 0.12117696553468704,
"learning_rate": 0.00019453318219291013,
"loss": 1.0611,
"step": 1162
},
{
"epoch": 0.08973259455552879,
"grad_norm": 0.13710887730121613,
"learning_rate": 0.00019452287716405605,
"loss": 1.0242,
"step": 1164
},
{
"epoch": 0.08988677427125993,
"grad_norm": 0.1160813644528389,
"learning_rate": 0.000194512572135202,
"loss": 1.0863,
"step": 1166
},
{
"epoch": 0.09004095398699109,
"grad_norm": 0.1754099279642105,
"learning_rate": 0.0001945022671063479,
"loss": 1.0938,
"step": 1168
},
{
"epoch": 0.09019513370272224,
"grad_norm": 0.1331128627061844,
"learning_rate": 0.00019449196207749382,
"loss": 1.0692,
"step": 1170
},
{
"epoch": 0.09034931341845338,
"grad_norm": 0.13422611355781555,
"learning_rate": 0.00019448165704863974,
"loss": 1.0699,
"step": 1172
},
{
"epoch": 0.09050349313418453,
"grad_norm": 0.12999802827835083,
"learning_rate": 0.00019447135201978565,
"loss": 1.0957,
"step": 1174
},
{
"epoch": 0.09065767284991569,
"grad_norm": 0.13413815200328827,
"learning_rate": 0.0001944610469909316,
"loss": 1.0869,
"step": 1176
},
{
"epoch": 0.09081185256564683,
"grad_norm": 0.12901006639003754,
"learning_rate": 0.0001944507419620775,
"loss": 1.0442,
"step": 1178
},
{
"epoch": 0.09096603228137798,
"grad_norm": 0.11824194341897964,
"learning_rate": 0.00019444043693322342,
"loss": 1.0935,
"step": 1180
},
{
"epoch": 0.09112021199710912,
"grad_norm": 0.14895616471767426,
"learning_rate": 0.00019443013190436934,
"loss": 1.0624,
"step": 1182
},
{
"epoch": 0.09127439171284028,
"grad_norm": 0.13515722751617432,
"learning_rate": 0.00019441982687551525,
"loss": 1.0797,
"step": 1184
},
{
"epoch": 0.09142857142857143,
"grad_norm": 0.13411575555801392,
"learning_rate": 0.00019440952184666117,
"loss": 1.0637,
"step": 1186
},
{
"epoch": 0.09158275114430257,
"grad_norm": 0.12519463896751404,
"learning_rate": 0.0001943992168178071,
"loss": 1.0608,
"step": 1188
},
{
"epoch": 0.09173693086003373,
"grad_norm": 0.1267428696155548,
"learning_rate": 0.00019438891178895302,
"loss": 1.0182,
"step": 1190
},
{
"epoch": 0.09189111057576488,
"grad_norm": 0.13116560876369476,
"learning_rate": 0.00019437860676009894,
"loss": 1.1139,
"step": 1192
},
{
"epoch": 0.09204529029149602,
"grad_norm": 0.14659713208675385,
"learning_rate": 0.00019436830173124485,
"loss": 1.1275,
"step": 1194
},
{
"epoch": 0.09219947000722717,
"grad_norm": 0.12913885712623596,
"learning_rate": 0.00019435799670239077,
"loss": 1.0858,
"step": 1196
},
{
"epoch": 0.09235364972295833,
"grad_norm": 0.12855856120586395,
"learning_rate": 0.0001943476916735367,
"loss": 1.0811,
"step": 1198
},
{
"epoch": 0.09250782943868947,
"grad_norm": 0.1391747146844864,
"learning_rate": 0.00019433738664468263,
"loss": 1.0146,
"step": 1200
},
{
"epoch": 0.09250782943868947,
"eval_loss": 1.0912913084030151,
"eval_runtime": 185.3661,
"eval_samples_per_second": 91.403,
"eval_steps_per_second": 1.43,
"step": 1200
},
{
"epoch": 0.09266200915442062,
"grad_norm": 0.13186782598495483,
"learning_rate": 0.00019432708161582854,
"loss": 1.1017,
"step": 1202
},
{
"epoch": 0.09281618887015176,
"grad_norm": 0.12913943827152252,
"learning_rate": 0.00019431677658697446,
"loss": 1.1027,
"step": 1204
},
{
"epoch": 0.09297036858588292,
"grad_norm": 0.1349743753671646,
"learning_rate": 0.00019430647155812037,
"loss": 1.1023,
"step": 1206
},
{
"epoch": 0.09312454830161407,
"grad_norm": 0.12534667551517487,
"learning_rate": 0.00019429616652926629,
"loss": 1.0659,
"step": 1208
},
{
"epoch": 0.09327872801734521,
"grad_norm": 0.11720700562000275,
"learning_rate": 0.0001942858615004122,
"loss": 1.0532,
"step": 1210
},
{
"epoch": 0.09343290773307637,
"grad_norm": 0.1364222913980484,
"learning_rate": 0.00019427555647155812,
"loss": 1.0575,
"step": 1212
},
{
"epoch": 0.09358708744880752,
"grad_norm": 0.15532977879047394,
"learning_rate": 0.00019426525144270403,
"loss": 1.1145,
"step": 1214
},
{
"epoch": 0.09374126716453866,
"grad_norm": 0.1377478837966919,
"learning_rate": 0.00019425494641384995,
"loss": 1.0505,
"step": 1216
},
{
"epoch": 0.09389544688026981,
"grad_norm": 0.1273409128189087,
"learning_rate": 0.0001942446413849959,
"loss": 1.0873,
"step": 1218
},
{
"epoch": 0.09404962659600097,
"grad_norm": 0.11990435421466827,
"learning_rate": 0.0001942343363561418,
"loss": 1.0829,
"step": 1220
},
{
"epoch": 0.09420380631173211,
"grad_norm": 0.14191892743110657,
"learning_rate": 0.00019422403132728772,
"loss": 1.0992,
"step": 1222
},
{
"epoch": 0.09435798602746326,
"grad_norm": 0.14520397782325745,
"learning_rate": 0.00019421372629843363,
"loss": 1.0712,
"step": 1224
},
{
"epoch": 0.09451216574319442,
"grad_norm": 0.13780727982521057,
"learning_rate": 0.00019420342126957955,
"loss": 0.9943,
"step": 1226
},
{
"epoch": 0.09466634545892556,
"grad_norm": 0.13550738990306854,
"learning_rate": 0.0001941931162407255,
"loss": 1.1264,
"step": 1228
},
{
"epoch": 0.09482052517465671,
"grad_norm": 0.12125276774168015,
"learning_rate": 0.0001941828112118714,
"loss": 1.1207,
"step": 1230
},
{
"epoch": 0.09497470489038785,
"grad_norm": 0.14529301226139069,
"learning_rate": 0.00019417250618301732,
"loss": 1.144,
"step": 1232
},
{
"epoch": 0.09512888460611901,
"grad_norm": 0.15477551519870758,
"learning_rate": 0.00019416220115416323,
"loss": 1.0568,
"step": 1234
},
{
"epoch": 0.09528306432185016,
"grad_norm": 0.1299963742494583,
"learning_rate": 0.00019415189612530915,
"loss": 1.0235,
"step": 1236
},
{
"epoch": 0.0954372440375813,
"grad_norm": 0.1372281014919281,
"learning_rate": 0.0001941415910964551,
"loss": 1.0764,
"step": 1238
},
{
"epoch": 0.09559142375331245,
"grad_norm": 0.1247306764125824,
"learning_rate": 0.000194131286067601,
"loss": 1.1345,
"step": 1240
},
{
"epoch": 0.09574560346904361,
"grad_norm": 0.1330571472644806,
"learning_rate": 0.00019412098103874692,
"loss": 1.1596,
"step": 1242
},
{
"epoch": 0.09589978318477475,
"grad_norm": 0.15787385404109955,
"learning_rate": 0.00019411067600989284,
"loss": 1.1067,
"step": 1244
},
{
"epoch": 0.0960539629005059,
"grad_norm": 0.12646274268627167,
"learning_rate": 0.00019410037098103875,
"loss": 1.0769,
"step": 1246
},
{
"epoch": 0.09620814261623706,
"grad_norm": 0.16424262523651123,
"learning_rate": 0.0001940900659521847,
"loss": 1.0459,
"step": 1248
},
{
"epoch": 0.0963623223319682,
"grad_norm": 0.1401062309741974,
"learning_rate": 0.0001940797609233306,
"loss": 1.1308,
"step": 1250
},
{
"epoch": 0.09651650204769935,
"grad_norm": 0.13971561193466187,
"learning_rate": 0.00019406945589447652,
"loss": 1.1457,
"step": 1252
},
{
"epoch": 0.0966706817634305,
"grad_norm": 0.13544687628746033,
"learning_rate": 0.00019405915086562244,
"loss": 1.0532,
"step": 1254
},
{
"epoch": 0.09682486147916165,
"grad_norm": 0.13527531921863556,
"learning_rate": 0.00019404884583676835,
"loss": 1.0376,
"step": 1256
},
{
"epoch": 0.0969790411948928,
"grad_norm": 0.1731848120689392,
"learning_rate": 0.0001940385408079143,
"loss": 1.2252,
"step": 1258
},
{
"epoch": 0.09713322091062394,
"grad_norm": 0.13142083585262299,
"learning_rate": 0.0001940282357790602,
"loss": 1.0254,
"step": 1260
},
{
"epoch": 0.09728740062635509,
"grad_norm": 0.13390247523784637,
"learning_rate": 0.00019401793075020612,
"loss": 1.0448,
"step": 1262
},
{
"epoch": 0.09744158034208625,
"grad_norm": 0.15188650786876678,
"learning_rate": 0.00019400762572135204,
"loss": 1.1019,
"step": 1264
},
{
"epoch": 0.0975957600578174,
"grad_norm": 0.14055617153644562,
"learning_rate": 0.00019399732069249795,
"loss": 1.0835,
"step": 1266
},
{
"epoch": 0.09774993977354854,
"grad_norm": 0.12209255248308182,
"learning_rate": 0.00019398701566364387,
"loss": 1.0675,
"step": 1268
},
{
"epoch": 0.0979041194892797,
"grad_norm": 0.14639706909656525,
"learning_rate": 0.00019397671063478978,
"loss": 1.049,
"step": 1270
},
{
"epoch": 0.09805829920501084,
"grad_norm": 0.13672591745853424,
"learning_rate": 0.0001939664056059357,
"loss": 1.1057,
"step": 1272
},
{
"epoch": 0.09821247892074199,
"grad_norm": 0.1522635966539383,
"learning_rate": 0.00019395610057708161,
"loss": 1.14,
"step": 1274
},
{
"epoch": 0.09836665863647313,
"grad_norm": 0.13887491822242737,
"learning_rate": 0.00019394579554822753,
"loss": 1.069,
"step": 1276
},
{
"epoch": 0.09852083835220429,
"grad_norm": 0.13854965567588806,
"learning_rate": 0.00019393549051937344,
"loss": 1.0704,
"step": 1278
},
{
"epoch": 0.09867501806793544,
"grad_norm": 0.12839765846729279,
"learning_rate": 0.00019392518549051939,
"loss": 1.0512,
"step": 1280
},
{
"epoch": 0.09882919778366658,
"grad_norm": 0.1270405352115631,
"learning_rate": 0.0001939148804616653,
"loss": 1.0251,
"step": 1282
},
{
"epoch": 0.09898337749939773,
"grad_norm": 0.1269143521785736,
"learning_rate": 0.00019390457543281122,
"loss": 1.0433,
"step": 1284
},
{
"epoch": 0.09913755721512889,
"grad_norm": 0.14292192459106445,
"learning_rate": 0.00019389427040395713,
"loss": 1.1507,
"step": 1286
},
{
"epoch": 0.09929173693086003,
"grad_norm": 0.12512263655662537,
"learning_rate": 0.00019388396537510305,
"loss": 1.0918,
"step": 1288
},
{
"epoch": 0.09944591664659118,
"grad_norm": 0.11927679181098938,
"learning_rate": 0.000193873660346249,
"loss": 1.0924,
"step": 1290
},
{
"epoch": 0.09960009636232234,
"grad_norm": 0.13639990985393524,
"learning_rate": 0.0001938633553173949,
"loss": 1.1024,
"step": 1292
},
{
"epoch": 0.09975427607805348,
"grad_norm": 0.142363503575325,
"learning_rate": 0.00019385305028854082,
"loss": 1.021,
"step": 1294
},
{
"epoch": 0.09990845579378463,
"grad_norm": 0.1389359086751938,
"learning_rate": 0.00019384274525968673,
"loss": 1.0269,
"step": 1296
},
{
"epoch": 0.10006263550951577,
"grad_norm": 0.15595073997974396,
"learning_rate": 0.00019383244023083265,
"loss": 1.0913,
"step": 1298
},
{
"epoch": 0.10021681522524693,
"grad_norm": 0.1324295848608017,
"learning_rate": 0.0001938221352019786,
"loss": 1.1001,
"step": 1300
},
{
"epoch": 0.10021681522524693,
"eval_loss": 1.0909266471862793,
"eval_runtime": 185.4116,
"eval_samples_per_second": 91.38,
"eval_steps_per_second": 1.429,
"step": 1300
},
{
"epoch": 0.10037099494097808,
"grad_norm": 0.139576256275177,
"learning_rate": 0.0001938118301731245,
"loss": 1.1147,
"step": 1302
},
{
"epoch": 0.10052517465670922,
"grad_norm": 0.12854811549186707,
"learning_rate": 0.00019380152514427042,
"loss": 1.0973,
"step": 1304
},
{
"epoch": 0.10067935437244037,
"grad_norm": 0.1245393380522728,
"learning_rate": 0.00019379122011541633,
"loss": 1.0485,
"step": 1306
},
{
"epoch": 0.10083353408817153,
"grad_norm": 0.13261497020721436,
"learning_rate": 0.00019378091508656225,
"loss": 1.156,
"step": 1308
},
{
"epoch": 0.10098771380390267,
"grad_norm": 0.1255144327878952,
"learning_rate": 0.0001937706100577082,
"loss": 1.0852,
"step": 1310
},
{
"epoch": 0.10114189351963382,
"grad_norm": 0.1412706971168518,
"learning_rate": 0.0001937603050288541,
"loss": 1.0766,
"step": 1312
},
{
"epoch": 0.10129607323536498,
"grad_norm": 0.1281047761440277,
"learning_rate": 0.00019375000000000002,
"loss": 1.0824,
"step": 1314
},
{
"epoch": 0.10145025295109612,
"grad_norm": 0.13307350873947144,
"learning_rate": 0.00019373969497114594,
"loss": 1.0887,
"step": 1316
},
{
"epoch": 0.10160443266682727,
"grad_norm": 0.1287691742181778,
"learning_rate": 0.00019372938994229185,
"loss": 1.0705,
"step": 1318
},
{
"epoch": 0.10175861238255841,
"grad_norm": 0.1303441971540451,
"learning_rate": 0.00019371908491343777,
"loss": 1.1684,
"step": 1320
},
{
"epoch": 0.10191279209828957,
"grad_norm": 0.13304616510868073,
"learning_rate": 0.00019370877988458368,
"loss": 1.0944,
"step": 1322
},
{
"epoch": 0.10206697181402072,
"grad_norm": 0.13905592262744904,
"learning_rate": 0.0001936984748557296,
"loss": 1.0915,
"step": 1324
},
{
"epoch": 0.10222115152975186,
"grad_norm": 0.13225632905960083,
"learning_rate": 0.0001936881698268755,
"loss": 1.0418,
"step": 1326
},
{
"epoch": 0.10237533124548302,
"grad_norm": 0.1267402619123459,
"learning_rate": 0.00019367786479802142,
"loss": 1.0446,
"step": 1328
},
{
"epoch": 0.10252951096121417,
"grad_norm": 0.1439935863018036,
"learning_rate": 0.00019366755976916737,
"loss": 1.0582,
"step": 1330
},
{
"epoch": 0.10268369067694531,
"grad_norm": 0.1267223060131073,
"learning_rate": 0.00019365725474031328,
"loss": 1.0176,
"step": 1332
},
{
"epoch": 0.10283787039267646,
"grad_norm": 0.1298942118883133,
"learning_rate": 0.0001936469497114592,
"loss": 1.0552,
"step": 1334
},
{
"epoch": 0.10299205010840762,
"grad_norm": 0.13010933995246887,
"learning_rate": 0.0001936366446826051,
"loss": 1.0848,
"step": 1336
},
{
"epoch": 0.10314622982413876,
"grad_norm": 0.13728559017181396,
"learning_rate": 0.00019362633965375103,
"loss": 1.0779,
"step": 1338
},
{
"epoch": 0.10330040953986991,
"grad_norm": 0.13863548636436462,
"learning_rate": 0.00019361603462489697,
"loss": 1.0326,
"step": 1340
},
{
"epoch": 0.10345458925560105,
"grad_norm": 0.12995532155036926,
"learning_rate": 0.00019360572959604288,
"loss": 1.1427,
"step": 1342
},
{
"epoch": 0.10360876897133221,
"grad_norm": 0.13650789856910706,
"learning_rate": 0.0001935954245671888,
"loss": 1.0528,
"step": 1344
},
{
"epoch": 0.10376294868706336,
"grad_norm": 0.1336941123008728,
"learning_rate": 0.0001935851195383347,
"loss": 1.1155,
"step": 1346
},
{
"epoch": 0.1039171284027945,
"grad_norm": 0.13927003741264343,
"learning_rate": 0.00019357481450948063,
"loss": 1.0551,
"step": 1348
},
{
"epoch": 0.10407130811852566,
"grad_norm": 0.14504994451999664,
"learning_rate": 0.00019356450948062657,
"loss": 1.1014,
"step": 1350
},
{
"epoch": 0.10422548783425681,
"grad_norm": 0.15796230733394623,
"learning_rate": 0.00019355420445177248,
"loss": 1.2115,
"step": 1352
},
{
"epoch": 0.10437966754998795,
"grad_norm": 0.1317984163761139,
"learning_rate": 0.0001935438994229184,
"loss": 1.0933,
"step": 1354
},
{
"epoch": 0.1045338472657191,
"grad_norm": 0.13189563155174255,
"learning_rate": 0.00019353359439406431,
"loss": 1.0664,
"step": 1356
},
{
"epoch": 0.10468802698145026,
"grad_norm": 0.1323234885931015,
"learning_rate": 0.00019352328936521023,
"loss": 1.0824,
"step": 1358
},
{
"epoch": 0.1048422066971814,
"grad_norm": 0.13659097254276276,
"learning_rate": 0.00019351298433635614,
"loss": 1.0334,
"step": 1360
},
{
"epoch": 0.10499638641291255,
"grad_norm": 0.11882172524929047,
"learning_rate": 0.0001935026793075021,
"loss": 1.0401,
"step": 1362
},
{
"epoch": 0.1051505661286437,
"grad_norm": 0.13025067746639252,
"learning_rate": 0.000193492374278648,
"loss": 1.0838,
"step": 1364
},
{
"epoch": 0.10530474584437485,
"grad_norm": 0.1249939501285553,
"learning_rate": 0.00019348206924979392,
"loss": 1.0349,
"step": 1366
},
{
"epoch": 0.105458925560106,
"grad_norm": 0.12588031589984894,
"learning_rate": 0.00019347176422093983,
"loss": 1.079,
"step": 1368
},
{
"epoch": 0.10561310527583714,
"grad_norm": 0.12548890709877014,
"learning_rate": 0.00019346145919208575,
"loss": 1.0062,
"step": 1370
},
{
"epoch": 0.1057672849915683,
"grad_norm": 0.13328798115253448,
"learning_rate": 0.00019345115416323166,
"loss": 1.1154,
"step": 1372
},
{
"epoch": 0.10592146470729945,
"grad_norm": 0.1443903148174286,
"learning_rate": 0.00019344084913437758,
"loss": 1.097,
"step": 1374
},
{
"epoch": 0.1060756444230306,
"grad_norm": 0.12835648655891418,
"learning_rate": 0.0001934305441055235,
"loss": 1.0723,
"step": 1376
},
{
"epoch": 0.10622982413876174,
"grad_norm": 0.13068312406539917,
"learning_rate": 0.0001934202390766694,
"loss": 1.1128,
"step": 1378
},
{
"epoch": 0.1063840038544929,
"grad_norm": 0.13628961145877838,
"learning_rate": 0.00019340993404781532,
"loss": 1.1146,
"step": 1380
},
{
"epoch": 0.10653818357022404,
"grad_norm": 0.12263484299182892,
"learning_rate": 0.00019339962901896126,
"loss": 1.0947,
"step": 1382
},
{
"epoch": 0.10669236328595519,
"grad_norm": 0.12684424221515656,
"learning_rate": 0.00019338932399010718,
"loss": 1.059,
"step": 1384
},
{
"epoch": 0.10684654300168633,
"grad_norm": 0.1421595960855484,
"learning_rate": 0.0001933790189612531,
"loss": 1.0688,
"step": 1386
},
{
"epoch": 0.10700072271741749,
"grad_norm": 0.12416025251150131,
"learning_rate": 0.000193368713932399,
"loss": 1.0905,
"step": 1388
},
{
"epoch": 0.10715490243314864,
"grad_norm": 0.1284332126379013,
"learning_rate": 0.00019335840890354492,
"loss": 1.0612,
"step": 1390
},
{
"epoch": 0.10730908214887978,
"grad_norm": 0.1282491385936737,
"learning_rate": 0.00019334810387469086,
"loss": 1.0851,
"step": 1392
},
{
"epoch": 0.10746326186461094,
"grad_norm": 0.13221289217472076,
"learning_rate": 0.00019333779884583678,
"loss": 1.0446,
"step": 1394
},
{
"epoch": 0.10761744158034209,
"grad_norm": 0.12401736527681351,
"learning_rate": 0.0001933274938169827,
"loss": 1.0826,
"step": 1396
},
{
"epoch": 0.10777162129607323,
"grad_norm": 0.14316771924495697,
"learning_rate": 0.0001933171887881286,
"loss": 1.1136,
"step": 1398
},
{
"epoch": 0.10792580101180438,
"grad_norm": 0.17223364114761353,
"learning_rate": 0.00019330688375927452,
"loss": 1.0752,
"step": 1400
},
{
"epoch": 0.10792580101180438,
"eval_loss": 1.0899540185928345,
"eval_runtime": 185.3818,
"eval_samples_per_second": 91.395,
"eval_steps_per_second": 1.429,
"step": 1400
},
{
"epoch": 0.10807998072753554,
"grad_norm": 0.15027141571044922,
"learning_rate": 0.00019329657873042047,
"loss": 1.0371,
"step": 1402
},
{
"epoch": 0.10823416044326668,
"grad_norm": 0.19876505434513092,
"learning_rate": 0.00019328627370156638,
"loss": 1.0312,
"step": 1404
},
{
"epoch": 0.10838834015899783,
"grad_norm": 0.1422131210565567,
"learning_rate": 0.0001932759686727123,
"loss": 1.0597,
"step": 1406
},
{
"epoch": 0.10854251987472899,
"grad_norm": 0.13597753643989563,
"learning_rate": 0.0001932656636438582,
"loss": 1.0939,
"step": 1408
},
{
"epoch": 0.10869669959046013,
"grad_norm": 0.16808953881263733,
"learning_rate": 0.00019325535861500413,
"loss": 1.1221,
"step": 1410
},
{
"epoch": 0.10885087930619128,
"grad_norm": 0.14884881675243378,
"learning_rate": 0.00019324505358615007,
"loss": 1.1114,
"step": 1412
},
{
"epoch": 0.10900505902192242,
"grad_norm": 0.12680503726005554,
"learning_rate": 0.00019323474855729598,
"loss": 1.1032,
"step": 1414
},
{
"epoch": 0.10915923873765358,
"grad_norm": 0.13997766375541687,
"learning_rate": 0.0001932244435284419,
"loss": 1.0799,
"step": 1416
},
{
"epoch": 0.10931341845338473,
"grad_norm": 0.1343669593334198,
"learning_rate": 0.0001932141384995878,
"loss": 1.0778,
"step": 1418
},
{
"epoch": 0.10946759816911587,
"grad_norm": 0.12029851973056793,
"learning_rate": 0.00019320383347073373,
"loss": 1.1021,
"step": 1420
},
{
"epoch": 0.10962177788484702,
"grad_norm": 0.1322990357875824,
"learning_rate": 0.00019319352844187967,
"loss": 1.1061,
"step": 1422
},
{
"epoch": 0.10977595760057818,
"grad_norm": 0.13710594177246094,
"learning_rate": 0.00019318322341302558,
"loss": 1.0786,
"step": 1424
},
{
"epoch": 0.10993013731630932,
"grad_norm": 0.11956049501895905,
"learning_rate": 0.0001931729183841715,
"loss": 1.0711,
"step": 1426
},
{
"epoch": 0.11008431703204047,
"grad_norm": 0.139973446726799,
"learning_rate": 0.00019316261335531741,
"loss": 1.1162,
"step": 1428
},
{
"epoch": 0.11023849674777163,
"grad_norm": 0.1525941640138626,
"learning_rate": 0.00019315230832646333,
"loss": 1.0572,
"step": 1430
},
{
"epoch": 0.11039267646350277,
"grad_norm": 0.1349973976612091,
"learning_rate": 0.00019314200329760924,
"loss": 1.1048,
"step": 1432
},
{
"epoch": 0.11054685617923392,
"grad_norm": 0.1305711269378662,
"learning_rate": 0.00019313169826875516,
"loss": 1.0841,
"step": 1434
},
{
"epoch": 0.11070103589496506,
"grad_norm": 0.16756822168827057,
"learning_rate": 0.00019312139323990107,
"loss": 1.0736,
"step": 1436
},
{
"epoch": 0.11085521561069622,
"grad_norm": 0.13367486000061035,
"learning_rate": 0.000193111088211047,
"loss": 1.0774,
"step": 1438
},
{
"epoch": 0.11100939532642737,
"grad_norm": 0.12484605610370636,
"learning_rate": 0.0001931007831821929,
"loss": 1.1196,
"step": 1440
},
{
"epoch": 0.11116357504215851,
"grad_norm": 0.14064739644527435,
"learning_rate": 0.00019309047815333885,
"loss": 1.1101,
"step": 1442
},
{
"epoch": 0.11131775475788966,
"grad_norm": 0.1366916447877884,
"learning_rate": 0.00019308017312448476,
"loss": 1.111,
"step": 1444
},
{
"epoch": 0.11147193447362082,
"grad_norm": 0.11520934104919434,
"learning_rate": 0.00019306986809563068,
"loss": 1.065,
"step": 1446
},
{
"epoch": 0.11162611418935196,
"grad_norm": 0.15567731857299805,
"learning_rate": 0.0001930595630667766,
"loss": 1.1036,
"step": 1448
},
{
"epoch": 0.11178029390508311,
"grad_norm": 0.13628730177879333,
"learning_rate": 0.0001930492580379225,
"loss": 1.0717,
"step": 1450
},
{
"epoch": 0.11193447362081427,
"grad_norm": 0.1359964907169342,
"learning_rate": 0.00019303895300906842,
"loss": 1.0986,
"step": 1452
},
{
"epoch": 0.11208865333654541,
"grad_norm": 0.16372162103652954,
"learning_rate": 0.00019302864798021436,
"loss": 1.0306,
"step": 1454
},
{
"epoch": 0.11224283305227656,
"grad_norm": 0.1724134087562561,
"learning_rate": 0.00019301834295136028,
"loss": 1.0753,
"step": 1456
},
{
"epoch": 0.1123970127680077,
"grad_norm": 0.13646383583545685,
"learning_rate": 0.0001930080379225062,
"loss": 1.0975,
"step": 1458
},
{
"epoch": 0.11255119248373886,
"grad_norm": 0.1522134691476822,
"learning_rate": 0.0001929977328936521,
"loss": 1.1031,
"step": 1460
},
{
"epoch": 0.11270537219947001,
"grad_norm": 0.13656160235404968,
"learning_rate": 0.00019298742786479802,
"loss": 1.0602,
"step": 1462
},
{
"epoch": 0.11285955191520115,
"grad_norm": 0.14140130579471588,
"learning_rate": 0.00019297712283594396,
"loss": 1.1289,
"step": 1464
},
{
"epoch": 0.1130137316309323,
"grad_norm": 0.1383032351732254,
"learning_rate": 0.00019296681780708988,
"loss": 1.0797,
"step": 1466
},
{
"epoch": 0.11316791134666346,
"grad_norm": 0.15723556280136108,
"learning_rate": 0.0001929565127782358,
"loss": 1.1156,
"step": 1468
},
{
"epoch": 0.1133220910623946,
"grad_norm": 0.13462230563163757,
"learning_rate": 0.0001929462077493817,
"loss": 1.0953,
"step": 1470
},
{
"epoch": 0.11347627077812575,
"grad_norm": 0.14101319015026093,
"learning_rate": 0.00019293590272052762,
"loss": 1.1152,
"step": 1472
},
{
"epoch": 0.11363045049385691,
"grad_norm": 0.13705132901668549,
"learning_rate": 0.00019292559769167357,
"loss": 1.0886,
"step": 1474
},
{
"epoch": 0.11378463020958805,
"grad_norm": 0.1206672340631485,
"learning_rate": 0.00019291529266281948,
"loss": 1.0995,
"step": 1476
},
{
"epoch": 0.1139388099253192,
"grad_norm": 0.13666383922100067,
"learning_rate": 0.0001929049876339654,
"loss": 1.058,
"step": 1478
},
{
"epoch": 0.11409298964105034,
"grad_norm": 0.1265423446893692,
"learning_rate": 0.0001928946826051113,
"loss": 1.0676,
"step": 1480
},
{
"epoch": 0.1142471693567815,
"grad_norm": 0.1528097242116928,
"learning_rate": 0.00019288437757625723,
"loss": 1.0675,
"step": 1482
},
{
"epoch": 0.11440134907251265,
"grad_norm": 0.16541676223278046,
"learning_rate": 0.00019287407254740314,
"loss": 1.1539,
"step": 1484
},
{
"epoch": 0.1145555287882438,
"grad_norm": 0.20383091270923615,
"learning_rate": 0.00019286376751854906,
"loss": 1.0472,
"step": 1486
},
{
"epoch": 0.11470970850397495,
"grad_norm": 0.13806484639644623,
"learning_rate": 0.00019285346248969497,
"loss": 1.0408,
"step": 1488
},
{
"epoch": 0.1148638882197061,
"grad_norm": 0.1251746118068695,
"learning_rate": 0.00019284315746084089,
"loss": 1.1207,
"step": 1490
},
{
"epoch": 0.11501806793543724,
"grad_norm": 0.13218504190444946,
"learning_rate": 0.0001928328524319868,
"loss": 1.1131,
"step": 1492
},
{
"epoch": 0.11517224765116839,
"grad_norm": 0.21616914868354797,
"learning_rate": 0.00019282254740313274,
"loss": 1.1103,
"step": 1494
},
{
"epoch": 0.11532642736689955,
"grad_norm": 0.1437305361032486,
"learning_rate": 0.00019281224237427866,
"loss": 1.1243,
"step": 1496
},
{
"epoch": 0.11548060708263069,
"grad_norm": 0.13094168901443481,
"learning_rate": 0.00019280193734542457,
"loss": 1.1012,
"step": 1498
},
{
"epoch": 0.11563478679836184,
"grad_norm": 0.12384334206581116,
"learning_rate": 0.0001927916323165705,
"loss": 1.05,
"step": 1500
},
{
"epoch": 0.11563478679836184,
"eval_loss": 1.0905406475067139,
"eval_runtime": 185.4473,
"eval_samples_per_second": 91.363,
"eval_steps_per_second": 1.429,
"step": 1500
},
{
"epoch": 0.11578896651409298,
"grad_norm": 0.12807106971740723,
"learning_rate": 0.0001927813272877164,
"loss": 1.0754,
"step": 1502
},
{
"epoch": 0.11594314622982414,
"grad_norm": 0.12517131865024567,
"learning_rate": 0.00019277102225886234,
"loss": 1.1017,
"step": 1504
},
{
"epoch": 0.11609732594555529,
"grad_norm": 0.1704496592283249,
"learning_rate": 0.00019276071723000826,
"loss": 1.098,
"step": 1506
},
{
"epoch": 0.11625150566128643,
"grad_norm": 0.12152231484651566,
"learning_rate": 0.00019275041220115417,
"loss": 1.0738,
"step": 1508
},
{
"epoch": 0.11640568537701759,
"grad_norm": 0.12952156364917755,
"learning_rate": 0.0001927401071723001,
"loss": 1.0479,
"step": 1510
},
{
"epoch": 0.11655986509274874,
"grad_norm": 0.1499640941619873,
"learning_rate": 0.000192729802143446,
"loss": 1.1046,
"step": 1512
},
{
"epoch": 0.11671404480847988,
"grad_norm": 0.1331593543291092,
"learning_rate": 0.00019271949711459195,
"loss": 1.1219,
"step": 1514
},
{
"epoch": 0.11686822452421103,
"grad_norm": 0.1368558406829834,
"learning_rate": 0.00019270919208573786,
"loss": 1.1357,
"step": 1516
},
{
"epoch": 0.11702240423994219,
"grad_norm": 0.12278290838003159,
"learning_rate": 0.00019269888705688378,
"loss": 1.1079,
"step": 1518
},
{
"epoch": 0.11717658395567333,
"grad_norm": 0.11737775802612305,
"learning_rate": 0.0001926885820280297,
"loss": 1.1224,
"step": 1520
},
{
"epoch": 0.11733076367140448,
"grad_norm": 0.13017341494560242,
"learning_rate": 0.0001926782769991756,
"loss": 1.0648,
"step": 1522
},
{
"epoch": 0.11748494338713562,
"grad_norm": 0.11939583718776703,
"learning_rate": 0.00019266797197032155,
"loss": 1.0899,
"step": 1524
},
{
"epoch": 0.11763912310286678,
"grad_norm": 0.12446755915880203,
"learning_rate": 0.00019265766694146746,
"loss": 1.0626,
"step": 1526
},
{
"epoch": 0.11779330281859793,
"grad_norm": 0.13369430601596832,
"learning_rate": 0.00019264736191261338,
"loss": 1.0526,
"step": 1528
},
{
"epoch": 0.11794748253432907,
"grad_norm": 0.13470736145973206,
"learning_rate": 0.0001926370568837593,
"loss": 1.0946,
"step": 1530
},
{
"epoch": 0.11810166225006023,
"grad_norm": 0.14193174242973328,
"learning_rate": 0.0001926267518549052,
"loss": 1.1089,
"step": 1532
},
{
"epoch": 0.11825584196579138,
"grad_norm": 0.14893026649951935,
"learning_rate": 0.00019261644682605112,
"loss": 1.0606,
"step": 1534
},
{
"epoch": 0.11841002168152252,
"grad_norm": 0.20594976842403412,
"learning_rate": 0.00019260614179719704,
"loss": 1.0375,
"step": 1536
},
{
"epoch": 0.11856420139725367,
"grad_norm": 0.15287873148918152,
"learning_rate": 0.00019259583676834295,
"loss": 1.1414,
"step": 1538
},
{
"epoch": 0.11871838111298483,
"grad_norm": 0.1275177299976349,
"learning_rate": 0.00019258553173948887,
"loss": 1.1084,
"step": 1540
},
{
"epoch": 0.11887256082871597,
"grad_norm": 0.20036157965660095,
"learning_rate": 0.00019257522671063478,
"loss": 1.1261,
"step": 1542
},
{
"epoch": 0.11902674054444712,
"grad_norm": 0.14492087066173553,
"learning_rate": 0.0001925649216817807,
"loss": 1.1137,
"step": 1544
},
{
"epoch": 0.11918092026017826,
"grad_norm": 0.1259312629699707,
"learning_rate": 0.00019255461665292664,
"loss": 1.0409,
"step": 1546
},
{
"epoch": 0.11933509997590942,
"grad_norm": 0.1296795755624771,
"learning_rate": 0.00019254431162407255,
"loss": 1.0332,
"step": 1548
},
{
"epoch": 0.11948927969164057,
"grad_norm": 0.13372276723384857,
"learning_rate": 0.00019253400659521847,
"loss": 1.1087,
"step": 1550
},
{
"epoch": 0.11964345940737171,
"grad_norm": 0.14354725182056427,
"learning_rate": 0.00019252370156636438,
"loss": 1.0398,
"step": 1552
},
{
"epoch": 0.11979763912310287,
"grad_norm": 0.1378318965435028,
"learning_rate": 0.0001925133965375103,
"loss": 1.0542,
"step": 1554
},
{
"epoch": 0.11995181883883402,
"grad_norm": 0.12171255797147751,
"learning_rate": 0.00019250309150865624,
"loss": 1.0935,
"step": 1556
},
{
"epoch": 0.12010599855456516,
"grad_norm": 0.11905664205551147,
"learning_rate": 0.00019249278647980215,
"loss": 1.0097,
"step": 1558
},
{
"epoch": 0.12026017827029631,
"grad_norm": 0.12854760885238647,
"learning_rate": 0.00019248248145094807,
"loss": 1.1517,
"step": 1560
},
{
"epoch": 0.12041435798602747,
"grad_norm": 0.247908353805542,
"learning_rate": 0.00019247217642209398,
"loss": 1.0876,
"step": 1562
},
{
"epoch": 0.12056853770175861,
"grad_norm": 0.1441553235054016,
"learning_rate": 0.0001924618713932399,
"loss": 1.1414,
"step": 1564
},
{
"epoch": 0.12072271741748976,
"grad_norm": 0.13307887315750122,
"learning_rate": 0.00019245156636438584,
"loss": 1.1012,
"step": 1566
},
{
"epoch": 0.12087689713322092,
"grad_norm": 0.14192406833171844,
"learning_rate": 0.00019244126133553176,
"loss": 1.1418,
"step": 1568
},
{
"epoch": 0.12103107684895206,
"grad_norm": 0.11530864983797073,
"learning_rate": 0.00019243095630667767,
"loss": 1.0776,
"step": 1570
},
{
"epoch": 0.12118525656468321,
"grad_norm": 0.13385196030139923,
"learning_rate": 0.00019242065127782359,
"loss": 1.1311,
"step": 1572
},
{
"epoch": 0.12133943628041435,
"grad_norm": 0.1308089643716812,
"learning_rate": 0.0001924103462489695,
"loss": 1.0625,
"step": 1574
},
{
"epoch": 0.12149361599614551,
"grad_norm": 0.11851842701435089,
"learning_rate": 0.00019240004122011544,
"loss": 1.0182,
"step": 1576
},
{
"epoch": 0.12164779571187666,
"grad_norm": 0.2496737688779831,
"learning_rate": 0.00019238973619126136,
"loss": 1.0746,
"step": 1578
},
{
"epoch": 0.1218019754276078,
"grad_norm": 0.12962055206298828,
"learning_rate": 0.00019237943116240727,
"loss": 1.0245,
"step": 1580
},
{
"epoch": 0.12195615514333895,
"grad_norm": 0.13170978426933289,
"learning_rate": 0.0001923691261335532,
"loss": 0.9897,
"step": 1582
},
{
"epoch": 0.12211033485907011,
"grad_norm": 0.13226309418678284,
"learning_rate": 0.0001923588211046991,
"loss": 1.1035,
"step": 1584
},
{
"epoch": 0.12226451457480125,
"grad_norm": 0.11901077628135681,
"learning_rate": 0.00019234851607584502,
"loss": 1.0084,
"step": 1586
},
{
"epoch": 0.1224186942905324,
"grad_norm": 0.15274369716644287,
"learning_rate": 0.00019233821104699093,
"loss": 1.1436,
"step": 1588
},
{
"epoch": 0.12257287400626356,
"grad_norm": 0.11832466721534729,
"learning_rate": 0.00019232790601813685,
"loss": 1.0179,
"step": 1590
},
{
"epoch": 0.1227270537219947,
"grad_norm": 0.13038666546344757,
"learning_rate": 0.00019231760098928276,
"loss": 1.0779,
"step": 1592
},
{
"epoch": 0.12288123343772585,
"grad_norm": 0.12837626039981842,
"learning_rate": 0.00019230729596042868,
"loss": 1.1404,
"step": 1594
},
{
"epoch": 0.123035413153457,
"grad_norm": 0.1400509923696518,
"learning_rate": 0.00019229699093157462,
"loss": 1.1132,
"step": 1596
},
{
"epoch": 0.12318959286918815,
"grad_norm": 0.13757595419883728,
"learning_rate": 0.00019228668590272053,
"loss": 1.0816,
"step": 1598
},
{
"epoch": 0.1233437725849193,
"grad_norm": 0.12403321266174316,
"learning_rate": 0.00019227638087386645,
"loss": 1.039,
"step": 1600
},
{
"epoch": 0.1233437725849193,
"eval_loss": 1.0888522863388062,
"eval_runtime": 185.2371,
"eval_samples_per_second": 91.467,
"eval_steps_per_second": 1.431,
"step": 1600
},
{
"epoch": 0.12349795230065044,
"grad_norm": 0.12380605190992355,
"learning_rate": 0.00019226607584501236,
"loss": 1.0903,
"step": 1602
},
{
"epoch": 0.12365213201638159,
"grad_norm": 0.13564443588256836,
"learning_rate": 0.00019225577081615828,
"loss": 1.0768,
"step": 1604
},
{
"epoch": 0.12380631173211275,
"grad_norm": 0.1533685177564621,
"learning_rate": 0.00019224546578730422,
"loss": 1.0852,
"step": 1606
},
{
"epoch": 0.12396049144784389,
"grad_norm": 0.1163390502333641,
"learning_rate": 0.00019223516075845014,
"loss": 1.0574,
"step": 1608
},
{
"epoch": 0.12411467116357504,
"grad_norm": 0.13867324590682983,
"learning_rate": 0.00019222485572959605,
"loss": 1.0992,
"step": 1610
},
{
"epoch": 0.1242688508793062,
"grad_norm": 0.12759087979793549,
"learning_rate": 0.00019221455070074197,
"loss": 1.0738,
"step": 1612
},
{
"epoch": 0.12442303059503734,
"grad_norm": 0.1237189844250679,
"learning_rate": 0.00019220424567188788,
"loss": 1.0974,
"step": 1614
},
{
"epoch": 0.12457721031076849,
"grad_norm": 0.13331052660942078,
"learning_rate": 0.00019219394064303382,
"loss": 1.0917,
"step": 1616
},
{
"epoch": 0.12473139002649963,
"grad_norm": 0.1290212869644165,
"learning_rate": 0.00019218363561417974,
"loss": 1.0696,
"step": 1618
},
{
"epoch": 0.12488556974223079,
"grad_norm": 0.13309410214424133,
"learning_rate": 0.00019217333058532565,
"loss": 1.043,
"step": 1620
},
{
"epoch": 0.12503974945796192,
"grad_norm": 0.13453248143196106,
"learning_rate": 0.00019216302555647157,
"loss": 1.0435,
"step": 1622
},
{
"epoch": 0.1251939291736931,
"grad_norm": 0.11639372259378433,
"learning_rate": 0.00019215272052761748,
"loss": 1.0579,
"step": 1624
},
{
"epoch": 0.12534810888942424,
"grad_norm": 0.13231517374515533,
"learning_rate": 0.0001921424154987634,
"loss": 1.1268,
"step": 1626
},
{
"epoch": 0.1255022886051554,
"grad_norm": 0.1349351406097412,
"learning_rate": 0.00019213211046990934,
"loss": 1.1599,
"step": 1628
},
{
"epoch": 0.12565646832088653,
"grad_norm": 0.13710346817970276,
"learning_rate": 0.00019212180544105525,
"loss": 1.0866,
"step": 1630
},
{
"epoch": 0.12581064803661768,
"grad_norm": 0.14535072445869446,
"learning_rate": 0.00019211150041220117,
"loss": 1.0445,
"step": 1632
},
{
"epoch": 0.12596482775234882,
"grad_norm": 0.11799806356430054,
"learning_rate": 0.00019210119538334708,
"loss": 1.0525,
"step": 1634
},
{
"epoch": 0.12611900746807997,
"grad_norm": 0.13399624824523926,
"learning_rate": 0.000192090890354493,
"loss": 1.0246,
"step": 1636
},
{
"epoch": 0.12627318718381114,
"grad_norm": 0.14404788613319397,
"learning_rate": 0.00019208058532563894,
"loss": 1.0582,
"step": 1638
},
{
"epoch": 0.1264273668995423,
"grad_norm": 0.14395713806152344,
"learning_rate": 0.00019207028029678486,
"loss": 1.0686,
"step": 1640
},
{
"epoch": 0.12658154661527343,
"grad_norm": 0.13249294459819794,
"learning_rate": 0.00019205997526793077,
"loss": 1.1286,
"step": 1642
},
{
"epoch": 0.12673572633100458,
"grad_norm": 0.12791812419891357,
"learning_rate": 0.00019204967023907669,
"loss": 1.062,
"step": 1644
},
{
"epoch": 0.12688990604673572,
"grad_norm": 0.12210959941148758,
"learning_rate": 0.0001920393652102226,
"loss": 1.0419,
"step": 1646
},
{
"epoch": 0.12704408576246687,
"grad_norm": 0.13438813388347626,
"learning_rate": 0.00019202906018136852,
"loss": 1.0589,
"step": 1648
},
{
"epoch": 0.127198265478198,
"grad_norm": 0.12953762710094452,
"learning_rate": 0.00019201875515251443,
"loss": 1.0128,
"step": 1650
},
{
"epoch": 0.1273524451939292,
"grad_norm": 0.1318603903055191,
"learning_rate": 0.00019200845012366035,
"loss": 1.073,
"step": 1652
},
{
"epoch": 0.12750662490966033,
"grad_norm": 0.12956051528453827,
"learning_rate": 0.00019199814509480626,
"loss": 1.0489,
"step": 1654
},
{
"epoch": 0.12766080462539148,
"grad_norm": 0.13501368463039398,
"learning_rate": 0.00019198784006595218,
"loss": 1.0198,
"step": 1656
},
{
"epoch": 0.12781498434112262,
"grad_norm": 0.13902342319488525,
"learning_rate": 0.00019197753503709812,
"loss": 1.0512,
"step": 1658
},
{
"epoch": 0.12796916405685377,
"grad_norm": 0.15590503811836243,
"learning_rate": 0.00019196723000824403,
"loss": 1.1782,
"step": 1660
},
{
"epoch": 0.1281233437725849,
"grad_norm": 0.13954932987689972,
"learning_rate": 0.00019195692497938995,
"loss": 1.0421,
"step": 1662
},
{
"epoch": 0.12827752348831606,
"grad_norm": 0.11550859361886978,
"learning_rate": 0.00019194661995053586,
"loss": 1.086,
"step": 1664
},
{
"epoch": 0.1284317032040472,
"grad_norm": 0.12175869196653366,
"learning_rate": 0.00019193631492168178,
"loss": 1.0704,
"step": 1666
},
{
"epoch": 0.12858588291977838,
"grad_norm": 0.13503512740135193,
"learning_rate": 0.00019192600989282772,
"loss": 1.1166,
"step": 1668
},
{
"epoch": 0.12874006263550952,
"grad_norm": 0.12849009037017822,
"learning_rate": 0.00019191570486397363,
"loss": 1.0315,
"step": 1670
},
{
"epoch": 0.12889424235124067,
"grad_norm": 0.12484319508075714,
"learning_rate": 0.00019190539983511955,
"loss": 1.0737,
"step": 1672
},
{
"epoch": 0.1290484220669718,
"grad_norm": 0.1364014446735382,
"learning_rate": 0.00019189509480626546,
"loss": 1.0619,
"step": 1674
},
{
"epoch": 0.12920260178270296,
"grad_norm": 0.12930172681808472,
"learning_rate": 0.00019188478977741138,
"loss": 1.046,
"step": 1676
},
{
"epoch": 0.1293567814984341,
"grad_norm": 0.13860805332660675,
"learning_rate": 0.00019187448474855732,
"loss": 1.0832,
"step": 1678
},
{
"epoch": 0.12951096121416525,
"grad_norm": 0.1379111111164093,
"learning_rate": 0.00019186417971970324,
"loss": 1.1406,
"step": 1680
},
{
"epoch": 0.12966514092989642,
"grad_norm": 0.1349123865365982,
"learning_rate": 0.00019185387469084915,
"loss": 1.1055,
"step": 1682
},
{
"epoch": 0.12981932064562757,
"grad_norm": 0.13304142653942108,
"learning_rate": 0.00019184356966199507,
"loss": 1.0392,
"step": 1684
},
{
"epoch": 0.1299735003613587,
"grad_norm": 0.12159105390310287,
"learning_rate": 0.00019183326463314098,
"loss": 1.0548,
"step": 1686
},
{
"epoch": 0.13012768007708986,
"grad_norm": 0.12661418318748474,
"learning_rate": 0.00019182295960428692,
"loss": 1.0588,
"step": 1688
},
{
"epoch": 0.130281859792821,
"grad_norm": 0.13691510260105133,
"learning_rate": 0.00019181265457543284,
"loss": 1.0854,
"step": 1690
},
{
"epoch": 0.13043603950855215,
"grad_norm": 0.1401318609714508,
"learning_rate": 0.00019180234954657875,
"loss": 1.0864,
"step": 1692
},
{
"epoch": 0.1305902192242833,
"grad_norm": 0.1355384737253189,
"learning_rate": 0.00019179204451772467,
"loss": 1.058,
"step": 1694
},
{
"epoch": 0.13074439894001447,
"grad_norm": 0.13987474143505096,
"learning_rate": 0.00019178173948887058,
"loss": 1.06,
"step": 1696
},
{
"epoch": 0.1308985786557456,
"grad_norm": 0.14350661635398865,
"learning_rate": 0.0001917714344600165,
"loss": 1.0731,
"step": 1698
},
{
"epoch": 0.13105275837147676,
"grad_norm": 0.12443742901086807,
"learning_rate": 0.0001917611294311624,
"loss": 1.0987,
"step": 1700
},
{
"epoch": 0.13105275837147676,
"eval_loss": 1.0880467891693115,
"eval_runtime": 185.5457,
"eval_samples_per_second": 91.314,
"eval_steps_per_second": 1.428,
"step": 1700
},
{
"epoch": 0.1312069380872079,
"grad_norm": 0.10956554859876633,
"learning_rate": 0.00019175082440230833,
"loss": 1.0393,
"step": 1702
},
{
"epoch": 0.13136111780293905,
"grad_norm": 0.11846137791872025,
"learning_rate": 0.00019174051937345424,
"loss": 1.0998,
"step": 1704
},
{
"epoch": 0.1315152975186702,
"grad_norm": 0.11894328892230988,
"learning_rate": 0.00019173021434460016,
"loss": 1.1007,
"step": 1706
},
{
"epoch": 0.13166947723440134,
"grad_norm": 0.11090514808893204,
"learning_rate": 0.00019171990931574607,
"loss": 1.0343,
"step": 1708
},
{
"epoch": 0.1318236569501325,
"grad_norm": 0.1276719868183136,
"learning_rate": 0.000191709604286892,
"loss": 1.0392,
"step": 1710
},
{
"epoch": 0.13197783666586366,
"grad_norm": 0.12342885881662369,
"learning_rate": 0.00019169929925803793,
"loss": 1.063,
"step": 1712
},
{
"epoch": 0.1321320163815948,
"grad_norm": 0.1237882748246193,
"learning_rate": 0.00019168899422918384,
"loss": 1.0558,
"step": 1714
},
{
"epoch": 0.13228619609732595,
"grad_norm": 0.12958785891532898,
"learning_rate": 0.00019167868920032976,
"loss": 1.0493,
"step": 1716
},
{
"epoch": 0.1324403758130571,
"grad_norm": 0.1181110367178917,
"learning_rate": 0.00019166838417147567,
"loss": 1.0668,
"step": 1718
},
{
"epoch": 0.13259455552878824,
"grad_norm": 0.12053950875997543,
"learning_rate": 0.00019165807914262162,
"loss": 1.0392,
"step": 1720
},
{
"epoch": 0.13274873524451938,
"grad_norm": 0.11725175380706787,
"learning_rate": 0.00019164777411376753,
"loss": 1.0188,
"step": 1722
},
{
"epoch": 0.13290291496025053,
"grad_norm": 0.12475614994764328,
"learning_rate": 0.00019163746908491344,
"loss": 1.0134,
"step": 1724
},
{
"epoch": 0.1330570946759817,
"grad_norm": 0.1231207475066185,
"learning_rate": 0.00019162716405605936,
"loss": 1.0309,
"step": 1726
},
{
"epoch": 0.13321127439171285,
"grad_norm": 0.1269765943288803,
"learning_rate": 0.00019161685902720527,
"loss": 1.0918,
"step": 1728
},
{
"epoch": 0.133365454107444,
"grad_norm": 0.12103556841611862,
"learning_rate": 0.00019160655399835122,
"loss": 1.0453,
"step": 1730
},
{
"epoch": 0.13351963382317514,
"grad_norm": 0.12427771091461182,
"learning_rate": 0.00019159624896949713,
"loss": 1.1544,
"step": 1732
},
{
"epoch": 0.13367381353890628,
"grad_norm": 0.13416282832622528,
"learning_rate": 0.00019158594394064305,
"loss": 1.0941,
"step": 1734
},
{
"epoch": 0.13382799325463743,
"grad_norm": 0.13207705318927765,
"learning_rate": 0.00019157563891178896,
"loss": 1.0998,
"step": 1736
},
{
"epoch": 0.13398217297036857,
"grad_norm": 0.1436687856912613,
"learning_rate": 0.00019156533388293488,
"loss": 1.0723,
"step": 1738
},
{
"epoch": 0.13413635268609975,
"grad_norm": 0.1206304207444191,
"learning_rate": 0.00019155502885408082,
"loss": 1.0279,
"step": 1740
},
{
"epoch": 0.1342905324018309,
"grad_norm": 0.12685900926589966,
"learning_rate": 0.00019154472382522673,
"loss": 1.0683,
"step": 1742
},
{
"epoch": 0.13444471211756204,
"grad_norm": 0.12833228707313538,
"learning_rate": 0.00019153441879637265,
"loss": 1.0904,
"step": 1744
},
{
"epoch": 0.13459889183329318,
"grad_norm": 0.12999312579631805,
"learning_rate": 0.00019152411376751856,
"loss": 1.0492,
"step": 1746
},
{
"epoch": 0.13475307154902433,
"grad_norm": 0.13486912846565247,
"learning_rate": 0.00019151380873866448,
"loss": 1.101,
"step": 1748
},
{
"epoch": 0.13490725126475547,
"grad_norm": 0.12793023884296417,
"learning_rate": 0.0001915035037098104,
"loss": 1.1135,
"step": 1750
},
{
"epoch": 0.13506143098048662,
"grad_norm": 0.12652675807476044,
"learning_rate": 0.0001914931986809563,
"loss": 1.0902,
"step": 1752
},
{
"epoch": 0.1352156106962178,
"grad_norm": 0.12431836873292923,
"learning_rate": 0.00019148289365210222,
"loss": 1.0922,
"step": 1754
},
{
"epoch": 0.13536979041194894,
"grad_norm": 0.13665209710597992,
"learning_rate": 0.00019147258862324814,
"loss": 1.0584,
"step": 1756
},
{
"epoch": 0.13552397012768008,
"grad_norm": 0.1355196088552475,
"learning_rate": 0.00019146228359439405,
"loss": 1.1199,
"step": 1758
},
{
"epoch": 0.13567814984341123,
"grad_norm": 0.14115893840789795,
"learning_rate": 0.00019145197856554,
"loss": 1.0697,
"step": 1760
},
{
"epoch": 0.13583232955914237,
"grad_norm": 0.13009534776210785,
"learning_rate": 0.0001914416735366859,
"loss": 1.1111,
"step": 1762
},
{
"epoch": 0.13598650927487352,
"grad_norm": 0.12280994653701782,
"learning_rate": 0.00019143136850783182,
"loss": 1.0341,
"step": 1764
},
{
"epoch": 0.13614068899060466,
"grad_norm": 0.15171582996845245,
"learning_rate": 0.00019142106347897774,
"loss": 1.1275,
"step": 1766
},
{
"epoch": 0.1362948687063358,
"grad_norm": 0.15258526802062988,
"learning_rate": 0.00019141075845012365,
"loss": 1.0513,
"step": 1768
},
{
"epoch": 0.13644904842206698,
"grad_norm": 0.132346972823143,
"learning_rate": 0.0001914004534212696,
"loss": 1.0878,
"step": 1770
},
{
"epoch": 0.13660322813779813,
"grad_norm": 0.13237041234970093,
"learning_rate": 0.0001913901483924155,
"loss": 1.0845,
"step": 1772
},
{
"epoch": 0.13675740785352927,
"grad_norm": 0.13837209343910217,
"learning_rate": 0.00019137984336356143,
"loss": 1.1221,
"step": 1774
},
{
"epoch": 0.13691158756926042,
"grad_norm": 0.17590375244617462,
"learning_rate": 0.00019136953833470734,
"loss": 1.1963,
"step": 1776
},
{
"epoch": 0.13706576728499156,
"grad_norm": 0.12898488342761993,
"learning_rate": 0.00019135923330585326,
"loss": 1.1306,
"step": 1778
},
{
"epoch": 0.1372199470007227,
"grad_norm": 0.12428785115480423,
"learning_rate": 0.0001913489282769992,
"loss": 1.068,
"step": 1780
},
{
"epoch": 0.13737412671645385,
"grad_norm": 0.12678809463977814,
"learning_rate": 0.0001913386232481451,
"loss": 1.0709,
"step": 1782
},
{
"epoch": 0.13752830643218503,
"grad_norm": 0.1344168782234192,
"learning_rate": 0.00019132831821929103,
"loss": 1.1073,
"step": 1784
},
{
"epoch": 0.13768248614791617,
"grad_norm": 0.14730733633041382,
"learning_rate": 0.00019131801319043694,
"loss": 1.0073,
"step": 1786
},
{
"epoch": 0.13783666586364732,
"grad_norm": 0.13661792874336243,
"learning_rate": 0.00019130770816158286,
"loss": 1.0637,
"step": 1788
},
{
"epoch": 0.13799084557937846,
"grad_norm": 0.1342434138059616,
"learning_rate": 0.0001912974031327288,
"loss": 1.1069,
"step": 1790
},
{
"epoch": 0.1381450252951096,
"grad_norm": 0.11941581219434738,
"learning_rate": 0.00019128709810387471,
"loss": 1.1023,
"step": 1792
},
{
"epoch": 0.13829920501084075,
"grad_norm": 0.13641759753227234,
"learning_rate": 0.00019127679307502063,
"loss": 1.0564,
"step": 1794
},
{
"epoch": 0.1384533847265719,
"grad_norm": 0.11148608475923538,
"learning_rate": 0.00019126648804616654,
"loss": 1.0255,
"step": 1796
},
{
"epoch": 0.13860756444230307,
"grad_norm": 0.1387186199426651,
"learning_rate": 0.00019125618301731246,
"loss": 1.0663,
"step": 1798
},
{
"epoch": 0.13876174415803422,
"grad_norm": 0.12380651384592056,
"learning_rate": 0.00019124587798845837,
"loss": 1.1222,
"step": 1800
},
{
"epoch": 0.13876174415803422,
"eval_loss": 1.0875153541564941,
"eval_runtime": 185.4605,
"eval_samples_per_second": 91.356,
"eval_steps_per_second": 1.429,
"step": 1800
},
{
"epoch": 0.13891592387376536,
"grad_norm": 0.13224369287490845,
"learning_rate": 0.00019123557295960432,
"loss": 1.0821,
"step": 1802
},
{
"epoch": 0.1390701035894965,
"grad_norm": 0.13096244633197784,
"learning_rate": 0.00019122526793075023,
"loss": 1.0097,
"step": 1804
},
{
"epoch": 0.13922428330522765,
"grad_norm": 0.11652527749538422,
"learning_rate": 0.00019121496290189615,
"loss": 1.0517,
"step": 1806
},
{
"epoch": 0.1393784630209588,
"grad_norm": 0.13449358940124512,
"learning_rate": 0.00019120465787304206,
"loss": 1.0915,
"step": 1808
},
{
"epoch": 0.13953264273668994,
"grad_norm": 0.11550068855285645,
"learning_rate": 0.00019119435284418798,
"loss": 1.0568,
"step": 1810
},
{
"epoch": 0.13968682245242112,
"grad_norm": 0.13804587721824646,
"learning_rate": 0.0001911840478153339,
"loss": 1.0933,
"step": 1812
},
{
"epoch": 0.13984100216815226,
"grad_norm": 0.12062159180641174,
"learning_rate": 0.0001911737427864798,
"loss": 1.0517,
"step": 1814
},
{
"epoch": 0.1399951818838834,
"grad_norm": 0.12154779583215714,
"learning_rate": 0.00019116343775762572,
"loss": 1.0955,
"step": 1816
},
{
"epoch": 0.14014936159961455,
"grad_norm": 0.11615799367427826,
"learning_rate": 0.00019115313272877164,
"loss": 0.968,
"step": 1818
},
{
"epoch": 0.1403035413153457,
"grad_norm": 0.1207037940621376,
"learning_rate": 0.00019114282769991755,
"loss": 1.0896,
"step": 1820
},
{
"epoch": 0.14045772103107684,
"grad_norm": 0.12750887870788574,
"learning_rate": 0.0001911325226710635,
"loss": 1.065,
"step": 1822
},
{
"epoch": 0.140611900746808,
"grad_norm": 0.16391952335834503,
"learning_rate": 0.0001911222176422094,
"loss": 1.0232,
"step": 1824
},
{
"epoch": 0.14076608046253913,
"grad_norm": 0.14626921713352203,
"learning_rate": 0.00019111191261335532,
"loss": 1.0375,
"step": 1826
},
{
"epoch": 0.1409202601782703,
"grad_norm": 0.12393996119499207,
"learning_rate": 0.00019110160758450124,
"loss": 1.0345,
"step": 1828
},
{
"epoch": 0.14107443989400145,
"grad_norm": 0.13275925815105438,
"learning_rate": 0.00019109130255564715,
"loss": 1.071,
"step": 1830
},
{
"epoch": 0.1412286196097326,
"grad_norm": 0.1255485862493515,
"learning_rate": 0.0001910809975267931,
"loss": 1.1026,
"step": 1832
},
{
"epoch": 0.14138279932546374,
"grad_norm": 0.13399668037891388,
"learning_rate": 0.000191070692497939,
"loss": 1.11,
"step": 1834
},
{
"epoch": 0.1415369790411949,
"grad_norm": 0.13084925711154938,
"learning_rate": 0.00019106038746908492,
"loss": 1.0528,
"step": 1836
},
{
"epoch": 0.14169115875692603,
"grad_norm": 0.15695689618587494,
"learning_rate": 0.00019105008244023084,
"loss": 1.1336,
"step": 1838
},
{
"epoch": 0.14184533847265718,
"grad_norm": 0.13630808889865875,
"learning_rate": 0.00019103977741137675,
"loss": 1.0767,
"step": 1840
},
{
"epoch": 0.14199951818838835,
"grad_norm": 0.11874844878911972,
"learning_rate": 0.0001910294723825227,
"loss": 1.0511,
"step": 1842
},
{
"epoch": 0.1421536979041195,
"grad_norm": 0.11898507922887802,
"learning_rate": 0.0001910191673536686,
"loss": 1.0866,
"step": 1844
},
{
"epoch": 0.14230787761985064,
"grad_norm": 0.1393211930990219,
"learning_rate": 0.00019100886232481453,
"loss": 1.0553,
"step": 1846
},
{
"epoch": 0.1424620573355818,
"grad_norm": 0.1382310539484024,
"learning_rate": 0.00019099855729596044,
"loss": 1.07,
"step": 1848
},
{
"epoch": 0.14261623705131293,
"grad_norm": 0.1471824198961258,
"learning_rate": 0.00019098825226710636,
"loss": 1.0893,
"step": 1850
},
{
"epoch": 0.14277041676704408,
"grad_norm": 0.12706084549427032,
"learning_rate": 0.0001909779472382523,
"loss": 1.0848,
"step": 1852
},
{
"epoch": 0.14292459648277522,
"grad_norm": 0.1324569135904312,
"learning_rate": 0.0001909676422093982,
"loss": 1.024,
"step": 1854
},
{
"epoch": 0.1430787761985064,
"grad_norm": 0.11245544254779816,
"learning_rate": 0.00019095733718054413,
"loss": 1.0802,
"step": 1856
},
{
"epoch": 0.14323295591423754,
"grad_norm": 0.15419217944145203,
"learning_rate": 0.00019094703215169004,
"loss": 1.1101,
"step": 1858
},
{
"epoch": 0.1433871356299687,
"grad_norm": 0.1071443036198616,
"learning_rate": 0.00019093672712283596,
"loss": 1.0576,
"step": 1860
},
{
"epoch": 0.14354131534569983,
"grad_norm": 0.1341090053319931,
"learning_rate": 0.00019092642209398187,
"loss": 1.0606,
"step": 1862
},
{
"epoch": 0.14369549506143098,
"grad_norm": 0.11848092079162598,
"learning_rate": 0.0001909161170651278,
"loss": 1.0714,
"step": 1864
},
{
"epoch": 0.14384967477716212,
"grad_norm": 0.12697815895080566,
"learning_rate": 0.0001909058120362737,
"loss": 1.092,
"step": 1866
},
{
"epoch": 0.14400385449289327,
"grad_norm": 0.11891257762908936,
"learning_rate": 0.00019089550700741962,
"loss": 0.9649,
"step": 1868
},
{
"epoch": 0.14415803420862444,
"grad_norm": 0.12616439163684845,
"learning_rate": 0.00019088520197856553,
"loss": 1.0962,
"step": 1870
},
{
"epoch": 0.1443122139243556,
"grad_norm": 0.12141067534685135,
"learning_rate": 0.00019087489694971147,
"loss": 1.0838,
"step": 1872
},
{
"epoch": 0.14446639364008673,
"grad_norm": 0.13279564678668976,
"learning_rate": 0.0001908645919208574,
"loss": 1.0484,
"step": 1874
},
{
"epoch": 0.14462057335581788,
"grad_norm": 0.15748505294322968,
"learning_rate": 0.0001908542868920033,
"loss": 1.1433,
"step": 1876
},
{
"epoch": 0.14477475307154902,
"grad_norm": 0.11593475937843323,
"learning_rate": 0.00019084398186314922,
"loss": 1.1483,
"step": 1878
},
{
"epoch": 0.14492893278728017,
"grad_norm": 0.14499489963054657,
"learning_rate": 0.00019083367683429513,
"loss": 1.0782,
"step": 1880
},
{
"epoch": 0.1450831125030113,
"grad_norm": 0.13570410013198853,
"learning_rate": 0.00019082337180544105,
"loss": 1.0989,
"step": 1882
},
{
"epoch": 0.14523729221874246,
"grad_norm": 0.12810774147510529,
"learning_rate": 0.000190813066776587,
"loss": 1.0374,
"step": 1884
},
{
"epoch": 0.14539147193447363,
"grad_norm": 0.11781581491231918,
"learning_rate": 0.0001908027617477329,
"loss": 1.0796,
"step": 1886
},
{
"epoch": 0.14554565165020478,
"grad_norm": 0.12243229150772095,
"learning_rate": 0.00019079245671887882,
"loss": 1.0477,
"step": 1888
},
{
"epoch": 0.14569983136593592,
"grad_norm": 0.1385030299425125,
"learning_rate": 0.00019078215169002474,
"loss": 1.0349,
"step": 1890
},
{
"epoch": 0.14585401108166707,
"grad_norm": 0.12011386454105377,
"learning_rate": 0.00019077184666117065,
"loss": 1.0718,
"step": 1892
},
{
"epoch": 0.1460081907973982,
"grad_norm": 0.12646062672138214,
"learning_rate": 0.0001907615416323166,
"loss": 1.1228,
"step": 1894
},
{
"epoch": 0.14616237051312936,
"grad_norm": 0.1284620612859726,
"learning_rate": 0.0001907512366034625,
"loss": 1.079,
"step": 1896
},
{
"epoch": 0.1463165502288605,
"grad_norm": 0.15374581515789032,
"learning_rate": 0.00019074093157460842,
"loss": 1.1147,
"step": 1898
},
{
"epoch": 0.14647072994459168,
"grad_norm": 0.1325882524251938,
"learning_rate": 0.00019073062654575434,
"loss": 1.0404,
"step": 1900
},
{
"epoch": 0.14647072994459168,
"eval_loss": 1.0869932174682617,
"eval_runtime": 185.4754,
"eval_samples_per_second": 91.349,
"eval_steps_per_second": 1.429,
"step": 1900
},
{
"epoch": 0.14662490966032282,
"grad_norm": 0.14041611552238464,
"learning_rate": 0.00019072032151690025,
"loss": 1.095,
"step": 1902
},
{
"epoch": 0.14677908937605397,
"grad_norm": 0.14162160456180573,
"learning_rate": 0.0001907100164880462,
"loss": 1.1714,
"step": 1904
},
{
"epoch": 0.1469332690917851,
"grad_norm": 0.12077832221984863,
"learning_rate": 0.0001906997114591921,
"loss": 1.1109,
"step": 1906
},
{
"epoch": 0.14708744880751626,
"grad_norm": 0.1738968789577484,
"learning_rate": 0.00019068940643033802,
"loss": 1.0838,
"step": 1908
},
{
"epoch": 0.1472416285232474,
"grad_norm": 0.13948039710521698,
"learning_rate": 0.00019067910140148394,
"loss": 1.0494,
"step": 1910
},
{
"epoch": 0.14739580823897855,
"grad_norm": 0.21179239451885223,
"learning_rate": 0.00019066879637262985,
"loss": 1.0962,
"step": 1912
},
{
"epoch": 0.14754998795470972,
"grad_norm": 0.12927787005901337,
"learning_rate": 0.00019065849134377577,
"loss": 1.1113,
"step": 1914
},
{
"epoch": 0.14770416767044087,
"grad_norm": 0.1296701431274414,
"learning_rate": 0.00019064818631492168,
"loss": 1.0603,
"step": 1916
},
{
"epoch": 0.147858347386172,
"grad_norm": 0.1282590925693512,
"learning_rate": 0.0001906378812860676,
"loss": 1.0594,
"step": 1918
},
{
"epoch": 0.14801252710190316,
"grad_norm": 0.13304758071899414,
"learning_rate": 0.0001906275762572135,
"loss": 1.0784,
"step": 1920
},
{
"epoch": 0.1481667068176343,
"grad_norm": 0.15661965310573578,
"learning_rate": 0.00019061727122835943,
"loss": 1.008,
"step": 1922
},
{
"epoch": 0.14832088653336545,
"grad_norm": 0.12986873090267181,
"learning_rate": 0.00019060696619950537,
"loss": 1.0788,
"step": 1924
},
{
"epoch": 0.1484750662490966,
"grad_norm": 0.1128251776099205,
"learning_rate": 0.00019059666117065128,
"loss": 1.1449,
"step": 1926
},
{
"epoch": 0.14862924596482774,
"grad_norm": 0.13722160458564758,
"learning_rate": 0.0001905863561417972,
"loss": 1.0914,
"step": 1928
},
{
"epoch": 0.1487834256805589,
"grad_norm": 0.1507786512374878,
"learning_rate": 0.00019057605111294311,
"loss": 1.0694,
"step": 1930
},
{
"epoch": 0.14893760539629006,
"grad_norm": 0.1368752121925354,
"learning_rate": 0.00019056574608408903,
"loss": 1.0417,
"step": 1932
},
{
"epoch": 0.1490917851120212,
"grad_norm": 0.12566259503364563,
"learning_rate": 0.00019055544105523497,
"loss": 1.0853,
"step": 1934
},
{
"epoch": 0.14924596482775235,
"grad_norm": 0.12362397462129593,
"learning_rate": 0.0001905451360263809,
"loss": 1.1136,
"step": 1936
},
{
"epoch": 0.1494001445434835,
"grad_norm": 0.12472514808177948,
"learning_rate": 0.0001905348309975268,
"loss": 1.0628,
"step": 1938
},
{
"epoch": 0.14955432425921464,
"grad_norm": 0.1355161964893341,
"learning_rate": 0.00019052452596867272,
"loss": 1.1211,
"step": 1940
},
{
"epoch": 0.14970850397494578,
"grad_norm": 0.13438721001148224,
"learning_rate": 0.00019051422093981863,
"loss": 1.0758,
"step": 1942
},
{
"epoch": 0.14986268369067696,
"grad_norm": 0.11768204718828201,
"learning_rate": 0.00019050391591096457,
"loss": 1.0533,
"step": 1944
},
{
"epoch": 0.1500168634064081,
"grad_norm": 0.13892577588558197,
"learning_rate": 0.0001904936108821105,
"loss": 1.1076,
"step": 1946
},
{
"epoch": 0.15017104312213925,
"grad_norm": 0.1532358080148697,
"learning_rate": 0.0001904833058532564,
"loss": 1.0706,
"step": 1948
},
{
"epoch": 0.1503252228378704,
"grad_norm": 0.13364464044570923,
"learning_rate": 0.00019047300082440232,
"loss": 1.1322,
"step": 1950
},
{
"epoch": 0.15047940255360154,
"grad_norm": 0.12663134932518005,
"learning_rate": 0.00019046269579554823,
"loss": 1.0749,
"step": 1952
},
{
"epoch": 0.15063358226933268,
"grad_norm": 0.1297607123851776,
"learning_rate": 0.00019045239076669417,
"loss": 1.0594,
"step": 1954
},
{
"epoch": 0.15078776198506383,
"grad_norm": 0.11931920051574707,
"learning_rate": 0.0001904420857378401,
"loss": 1.0522,
"step": 1956
},
{
"epoch": 0.150941941700795,
"grad_norm": 0.1334810107946396,
"learning_rate": 0.000190431780708986,
"loss": 1.0674,
"step": 1958
},
{
"epoch": 0.15109612141652615,
"grad_norm": 0.12633340060710907,
"learning_rate": 0.00019042147568013192,
"loss": 1.0139,
"step": 1960
},
{
"epoch": 0.1512503011322573,
"grad_norm": 0.12485836446285248,
"learning_rate": 0.00019041117065127783,
"loss": 1.0288,
"step": 1962
},
{
"epoch": 0.15140448084798844,
"grad_norm": 0.10940799117088318,
"learning_rate": 0.00019040086562242375,
"loss": 1.0475,
"step": 1964
},
{
"epoch": 0.15155866056371958,
"grad_norm": 0.12229325622320175,
"learning_rate": 0.00019039056059356966,
"loss": 1.0628,
"step": 1966
},
{
"epoch": 0.15171284027945073,
"grad_norm": 0.14333505928516388,
"learning_rate": 0.00019038025556471558,
"loss": 1.0423,
"step": 1968
},
{
"epoch": 0.15186701999518187,
"grad_norm": 0.12773017585277557,
"learning_rate": 0.0001903699505358615,
"loss": 1.1283,
"step": 1970
},
{
"epoch": 0.15202119971091305,
"grad_norm": 0.11913473904132843,
"learning_rate": 0.0001903596455070074,
"loss": 1.0646,
"step": 1972
},
{
"epoch": 0.1521753794266442,
"grad_norm": 0.13321518898010254,
"learning_rate": 0.00019034934047815332,
"loss": 1.0476,
"step": 1974
},
{
"epoch": 0.15232955914237534,
"grad_norm": 0.1362799108028412,
"learning_rate": 0.00019033903544929927,
"loss": 1.0937,
"step": 1976
},
{
"epoch": 0.15248373885810648,
"grad_norm": 0.13804180920124054,
"learning_rate": 0.00019032873042044518,
"loss": 1.113,
"step": 1978
},
{
"epoch": 0.15263791857383763,
"grad_norm": 0.1774570494890213,
"learning_rate": 0.0001903184253915911,
"loss": 1.0795,
"step": 1980
},
{
"epoch": 0.15279209828956877,
"grad_norm": 0.13106994330883026,
"learning_rate": 0.000190308120362737,
"loss": 1.098,
"step": 1982
},
{
"epoch": 0.15294627800529992,
"grad_norm": 0.14435411989688873,
"learning_rate": 0.00019029781533388293,
"loss": 1.0814,
"step": 1984
},
{
"epoch": 0.15310045772103106,
"grad_norm": 0.13178013265132904,
"learning_rate": 0.00019028751030502887,
"loss": 1.1002,
"step": 1986
},
{
"epoch": 0.15325463743676224,
"grad_norm": 0.1283218264579773,
"learning_rate": 0.00019027720527617478,
"loss": 1.0749,
"step": 1988
},
{
"epoch": 0.15340881715249338,
"grad_norm": 0.12113723158836365,
"learning_rate": 0.0001902669002473207,
"loss": 1.0831,
"step": 1990
},
{
"epoch": 0.15356299686822453,
"grad_norm": 0.12649892270565033,
"learning_rate": 0.0001902565952184666,
"loss": 1.0166,
"step": 1992
},
{
"epoch": 0.15371717658395567,
"grad_norm": 0.12823793292045593,
"learning_rate": 0.00019024629018961253,
"loss": 1.0273,
"step": 1994
},
{
"epoch": 0.15387135629968682,
"grad_norm": 0.1291527897119522,
"learning_rate": 0.00019023598516075847,
"loss": 1.1092,
"step": 1996
},
{
"epoch": 0.15402553601541796,
"grad_norm": 0.12588894367218018,
"learning_rate": 0.00019022568013190438,
"loss": 1.0627,
"step": 1998
},
{
"epoch": 0.1541797157311491,
"grad_norm": 0.12996312975883484,
"learning_rate": 0.0001902153751030503,
"loss": 1.1196,
"step": 2000
},
{
"epoch": 0.1541797157311491,
"eval_loss": 1.0863893032073975,
"eval_runtime": 185.3254,
"eval_samples_per_second": 91.423,
"eval_steps_per_second": 1.43,
"step": 2000
},
{
"epoch": 0.15433389544688028,
"grad_norm": 0.14361834526062012,
"learning_rate": 0.00019020507007419621,
"loss": 1.1151,
"step": 2002
},
{
"epoch": 0.15448807516261143,
"grad_norm": 0.12650837004184723,
"learning_rate": 0.00019019476504534213,
"loss": 1.1155,
"step": 2004
},
{
"epoch": 0.15464225487834257,
"grad_norm": 0.13820499181747437,
"learning_rate": 0.00019018446001648807,
"loss": 1.1243,
"step": 2006
},
{
"epoch": 0.15479643459407372,
"grad_norm": 0.13205693662166595,
"learning_rate": 0.00019017415498763399,
"loss": 1.0626,
"step": 2008
},
{
"epoch": 0.15495061430980486,
"grad_norm": 0.13930106163024902,
"learning_rate": 0.0001901638499587799,
"loss": 1.1105,
"step": 2010
},
{
"epoch": 0.155104794025536,
"grad_norm": 0.14711922407150269,
"learning_rate": 0.00019015354492992582,
"loss": 1.0556,
"step": 2012
},
{
"epoch": 0.15525897374126715,
"grad_norm": 0.11909156292676926,
"learning_rate": 0.00019014323990107173,
"loss": 1.1025,
"step": 2014
},
{
"epoch": 0.15541315345699833,
"grad_norm": 0.14099714159965515,
"learning_rate": 0.00019013293487221767,
"loss": 1.064,
"step": 2016
},
{
"epoch": 0.15556733317272947,
"grad_norm": 0.11500216275453568,
"learning_rate": 0.0001901226298433636,
"loss": 1.1196,
"step": 2018
},
{
"epoch": 0.15572151288846062,
"grad_norm": 0.12341683357954025,
"learning_rate": 0.0001901123248145095,
"loss": 1.0625,
"step": 2020
},
{
"epoch": 0.15587569260419176,
"grad_norm": 0.1390669196844101,
"learning_rate": 0.00019010201978565542,
"loss": 1.0526,
"step": 2022
},
{
"epoch": 0.1560298723199229,
"grad_norm": 0.13482992351055145,
"learning_rate": 0.00019009171475680133,
"loss": 1.1074,
"step": 2024
},
{
"epoch": 0.15618405203565405,
"grad_norm": 0.12277045845985413,
"learning_rate": 0.00019008140972794725,
"loss": 1.0648,
"step": 2026
},
{
"epoch": 0.1563382317513852,
"grad_norm": 0.13579949736595154,
"learning_rate": 0.00019007110469909316,
"loss": 1.1235,
"step": 2028
},
{
"epoch": 0.15649241146711637,
"grad_norm": 0.14128637313842773,
"learning_rate": 0.00019006079967023908,
"loss": 1.0442,
"step": 2030
},
{
"epoch": 0.15664659118284752,
"grad_norm": 0.13722474873065948,
"learning_rate": 0.000190050494641385,
"loss": 1.1215,
"step": 2032
},
{
"epoch": 0.15680077089857866,
"grad_norm": 0.13500674068927765,
"learning_rate": 0.0001900401896125309,
"loss": 1.0776,
"step": 2034
},
{
"epoch": 0.1569549506143098,
"grad_norm": 0.11917294561862946,
"learning_rate": 0.00019002988458367685,
"loss": 1.0698,
"step": 2036
},
{
"epoch": 0.15710913033004095,
"grad_norm": 0.12245581299066544,
"learning_rate": 0.00019001957955482276,
"loss": 1.0166,
"step": 2038
},
{
"epoch": 0.1572633100457721,
"grad_norm": 0.12556669116020203,
"learning_rate": 0.00019000927452596868,
"loss": 1.0846,
"step": 2040
},
{
"epoch": 0.15741748976150324,
"grad_norm": 0.13316373527050018,
"learning_rate": 0.0001899989694971146,
"loss": 1.0566,
"step": 2042
},
{
"epoch": 0.1575716694772344,
"grad_norm": 0.1296815425157547,
"learning_rate": 0.0001899886644682605,
"loss": 1.0824,
"step": 2044
},
{
"epoch": 0.15772584919296556,
"grad_norm": 0.1288246214389801,
"learning_rate": 0.00018997835943940645,
"loss": 1.0974,
"step": 2046
},
{
"epoch": 0.1578800289086967,
"grad_norm": 0.1185479462146759,
"learning_rate": 0.00018996805441055237,
"loss": 1.1443,
"step": 2048
},
{
"epoch": 0.15803420862442785,
"grad_norm": 0.12504369020462036,
"learning_rate": 0.00018995774938169828,
"loss": 1.0899,
"step": 2050
},
{
"epoch": 0.158188388340159,
"grad_norm": 0.1266452521085739,
"learning_rate": 0.0001899474443528442,
"loss": 1.0654,
"step": 2052
},
{
"epoch": 0.15834256805589014,
"grad_norm": 0.13447126746177673,
"learning_rate": 0.0001899371393239901,
"loss": 1.0649,
"step": 2054
},
{
"epoch": 0.1584967477716213,
"grad_norm": 0.1446131467819214,
"learning_rate": 0.00018992683429513603,
"loss": 1.1439,
"step": 2056
},
{
"epoch": 0.15865092748735243,
"grad_norm": 0.12688389420509338,
"learning_rate": 0.00018991652926628197,
"loss": 1.0262,
"step": 2058
},
{
"epoch": 0.1588051072030836,
"grad_norm": 0.12581713497638702,
"learning_rate": 0.00018990622423742788,
"loss": 1.0723,
"step": 2060
},
{
"epoch": 0.15895928691881475,
"grad_norm": 0.15745951235294342,
"learning_rate": 0.0001898959192085738,
"loss": 1.1038,
"step": 2062
},
{
"epoch": 0.1591134666345459,
"grad_norm": 0.14457587897777557,
"learning_rate": 0.0001898856141797197,
"loss": 1.1072,
"step": 2064
},
{
"epoch": 0.15926764635027704,
"grad_norm": 0.11454683542251587,
"learning_rate": 0.00018987530915086563,
"loss": 1.0605,
"step": 2066
},
{
"epoch": 0.1594218260660082,
"grad_norm": 0.1137547716498375,
"learning_rate": 0.00018986500412201157,
"loss": 1.0405,
"step": 2068
},
{
"epoch": 0.15957600578173933,
"grad_norm": 0.1220378428697586,
"learning_rate": 0.00018985469909315748,
"loss": 1.086,
"step": 2070
},
{
"epoch": 0.15973018549747048,
"grad_norm": 0.13579098880290985,
"learning_rate": 0.0001898443940643034,
"loss": 1.0334,
"step": 2072
},
{
"epoch": 0.15988436521320165,
"grad_norm": 0.1529407948255539,
"learning_rate": 0.00018983408903544931,
"loss": 1.0614,
"step": 2074
},
{
"epoch": 0.1600385449289328,
"grad_norm": 0.13769444823265076,
"learning_rate": 0.00018982378400659523,
"loss": 1.1212,
"step": 2076
},
{
"epoch": 0.16019272464466394,
"grad_norm": 0.12095335125923157,
"learning_rate": 0.00018981347897774114,
"loss": 1.047,
"step": 2078
},
{
"epoch": 0.1603469043603951,
"grad_norm": 0.12483233958482742,
"learning_rate": 0.00018980317394888706,
"loss": 1.0808,
"step": 2080
},
{
"epoch": 0.16050108407612623,
"grad_norm": 0.12451382726430893,
"learning_rate": 0.00018979286892003297,
"loss": 1.1259,
"step": 2082
},
{
"epoch": 0.16065526379185738,
"grad_norm": 0.12540730834007263,
"learning_rate": 0.0001897825638911789,
"loss": 1.0761,
"step": 2084
},
{
"epoch": 0.16080944350758852,
"grad_norm": 0.12948516011238098,
"learning_rate": 0.0001897722588623248,
"loss": 1.0621,
"step": 2086
},
{
"epoch": 0.16096362322331967,
"grad_norm": 0.1349886953830719,
"learning_rate": 0.00018976195383347075,
"loss": 1.0549,
"step": 2088
},
{
"epoch": 0.16111780293905084,
"grad_norm": 0.1249813437461853,
"learning_rate": 0.00018975164880461666,
"loss": 1.0828,
"step": 2090
},
{
"epoch": 0.161271982654782,
"grad_norm": 0.1299104243516922,
"learning_rate": 0.00018974134377576258,
"loss": 1.097,
"step": 2092
},
{
"epoch": 0.16142616237051313,
"grad_norm": 0.13004744052886963,
"learning_rate": 0.0001897310387469085,
"loss": 1.0417,
"step": 2094
},
{
"epoch": 0.16158034208624428,
"grad_norm": 0.11553830653429031,
"learning_rate": 0.0001897207337180544,
"loss": 1.0563,
"step": 2096
},
{
"epoch": 0.16173452180197542,
"grad_norm": 0.12000396102666855,
"learning_rate": 0.00018971042868920035,
"loss": 1.077,
"step": 2098
},
{
"epoch": 0.16188870151770657,
"grad_norm": 0.13707685470581055,
"learning_rate": 0.00018970012366034626,
"loss": 1.0994,
"step": 2100
},
{
"epoch": 0.16188870151770657,
"eval_loss": 1.0858707427978516,
"eval_runtime": 185.7188,
"eval_samples_per_second": 91.229,
"eval_steps_per_second": 1.427,
"step": 2100
}
],
"logging_steps": 2,
"max_steps": 38916,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.132999221824717e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}