{
"best_global_step": 758667,
"best_metric": 0.06412914395332336,
"best_model_checkpoint": "/media/user/Expansion1/multilingual-e5-small-aligned-v2-text-quality-v3/checkpoint-758667",
"epoch": 10.0,
"eval_steps": 500,
"global_step": 1083810,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004613354739299324,
"grad_norm": 0.9464718699455261,
"learning_rate": 4.99769793598509e-05,
"loss": 0.2108,
"num_input_tokens_seen": 512000,
"step": 500
},
{
"epoch": 0.009226709478598648,
"grad_norm": 1.6402217149734497,
"learning_rate": 4.99539125861544e-05,
"loss": 0.1148,
"num_input_tokens_seen": 1024000,
"step": 1000
},
{
"epoch": 0.013840064217897971,
"grad_norm": 2.3964197635650635,
"learning_rate": 4.9930845812457905e-05,
"loss": 0.115,
"num_input_tokens_seen": 1536000,
"step": 1500
},
{
"epoch": 0.018453418957197296,
"grad_norm": 1.5508780479431152,
"learning_rate": 4.990777903876141e-05,
"loss": 0.0986,
"num_input_tokens_seen": 2048000,
"step": 2000
},
{
"epoch": 0.02306677369649662,
"grad_norm": 1.0917489528656006,
"learning_rate": 4.9884712265064913e-05,
"loss": 0.1006,
"num_input_tokens_seen": 2560000,
"step": 2500
},
{
"epoch": 0.027680128435795943,
"grad_norm": 3.2608118057250977,
"learning_rate": 4.9861645491368414e-05,
"loss": 0.0966,
"num_input_tokens_seen": 3072000,
"step": 3000
},
{
"epoch": 0.03229348317509526,
"grad_norm": 0.6695080995559692,
"learning_rate": 4.983857871767192e-05,
"loss": 0.0966,
"num_input_tokens_seen": 3584000,
"step": 3500
},
{
"epoch": 0.03690683791439459,
"grad_norm": 0.9232053756713867,
"learning_rate": 4.981551194397542e-05,
"loss": 0.0937,
"num_input_tokens_seen": 4096000,
"step": 4000
},
{
"epoch": 0.041520192653693916,
"grad_norm": 1.5442851781845093,
"learning_rate": 4.979244517027893e-05,
"loss": 0.0966,
"num_input_tokens_seen": 4608000,
"step": 4500
},
{
"epoch": 0.04613354739299324,
"grad_norm": 1.1777746677398682,
"learning_rate": 4.976937839658243e-05,
"loss": 0.0928,
"num_input_tokens_seen": 5120000,
"step": 5000
},
{
"epoch": 0.05074690213229256,
"grad_norm": 1.1882743835449219,
"learning_rate": 4.974631162288593e-05,
"loss": 0.0982,
"num_input_tokens_seen": 5632000,
"step": 5500
},
{
"epoch": 0.055360256871591886,
"grad_norm": 1.9017492532730103,
"learning_rate": 4.972324484918944e-05,
"loss": 0.0968,
"num_input_tokens_seen": 6144000,
"step": 6000
},
{
"epoch": 0.05997361161089121,
"grad_norm": 0.9373461008071899,
"learning_rate": 4.970017807549294e-05,
"loss": 0.0942,
"num_input_tokens_seen": 6656000,
"step": 6500
},
{
"epoch": 0.06458696635019053,
"grad_norm": 1.4917376041412354,
"learning_rate": 4.967711130179644e-05,
"loss": 0.0935,
"num_input_tokens_seen": 7168000,
"step": 7000
},
{
"epoch": 0.06920032108948986,
"grad_norm": 0.534630298614502,
"learning_rate": 4.9654044528099946e-05,
"loss": 0.0879,
"num_input_tokens_seen": 7680000,
"step": 7500
},
{
"epoch": 0.07381367582878919,
"grad_norm": 1.9700461626052856,
"learning_rate": 4.9630977754403454e-05,
"loss": 0.0913,
"num_input_tokens_seen": 8192000,
"step": 8000
},
{
"epoch": 0.0784270305680885,
"grad_norm": 2.11916446685791,
"learning_rate": 4.960791098070695e-05,
"loss": 0.0929,
"num_input_tokens_seen": 8704000,
"step": 8500
},
{
"epoch": 0.08304038530738783,
"grad_norm": 1.961242437362671,
"learning_rate": 4.9584844207010455e-05,
"loss": 0.0883,
"num_input_tokens_seen": 9216000,
"step": 9000
},
{
"epoch": 0.08765374004668715,
"grad_norm": 1.5819107294082642,
"learning_rate": 4.956177743331396e-05,
"loss": 0.0849,
"num_input_tokens_seen": 9728000,
"step": 9500
},
{
"epoch": 0.09226709478598648,
"grad_norm": 0.8099465370178223,
"learning_rate": 4.953871065961746e-05,
"loss": 0.0925,
"num_input_tokens_seen": 10240000,
"step": 10000
},
{
"epoch": 0.0968804495252858,
"grad_norm": 0.8762685656547546,
"learning_rate": 4.9515643885920963e-05,
"loss": 0.0867,
"num_input_tokens_seen": 10752000,
"step": 10500
},
{
"epoch": 0.10149380426458512,
"grad_norm": 2.166046142578125,
"learning_rate": 4.949257711222447e-05,
"loss": 0.0906,
"num_input_tokens_seen": 11264000,
"step": 11000
},
{
"epoch": 0.10610715900388444,
"grad_norm": 0.9908414483070374,
"learning_rate": 4.946951033852797e-05,
"loss": 0.0989,
"num_input_tokens_seen": 11776000,
"step": 11500
},
{
"epoch": 0.11072051374318377,
"grad_norm": 0.9543777704238892,
"learning_rate": 4.944644356483147e-05,
"loss": 0.0917,
"num_input_tokens_seen": 12288000,
"step": 12000
},
{
"epoch": 0.11533386848248309,
"grad_norm": 2.302893877029419,
"learning_rate": 4.942337679113498e-05,
"loss": 0.0906,
"num_input_tokens_seen": 12800000,
"step": 12500
},
{
"epoch": 0.11994722322178242,
"grad_norm": 1.214758038520813,
"learning_rate": 4.940031001743849e-05,
"loss": 0.0831,
"num_input_tokens_seen": 13312000,
"step": 13000
},
{
"epoch": 0.12456057796108173,
"grad_norm": 1.4494785070419312,
"learning_rate": 4.937724324374199e-05,
"loss": 0.0949,
"num_input_tokens_seen": 13824000,
"step": 13500
},
{
"epoch": 0.12917393270038105,
"grad_norm": 1.3759499788284302,
"learning_rate": 4.935417647004549e-05,
"loss": 0.0775,
"num_input_tokens_seen": 14336000,
"step": 14000
},
{
"epoch": 0.13378728743968038,
"grad_norm": 1.4409326314926147,
"learning_rate": 4.9331109696348995e-05,
"loss": 0.0874,
"num_input_tokens_seen": 14848000,
"step": 14500
},
{
"epoch": 0.1384006421789797,
"grad_norm": 0.6916935443878174,
"learning_rate": 4.9308042922652496e-05,
"loss": 0.0888,
"num_input_tokens_seen": 15360000,
"step": 15000
},
{
"epoch": 0.14301399691827904,
"grad_norm": 2.6819546222686768,
"learning_rate": 4.9284976148956e-05,
"loss": 0.0866,
"num_input_tokens_seen": 15872000,
"step": 15500
},
{
"epoch": 0.14762735165757837,
"grad_norm": 2.243403434753418,
"learning_rate": 4.9261909375259504e-05,
"loss": 0.0858,
"num_input_tokens_seen": 16384000,
"step": 16000
},
{
"epoch": 0.15224070639687767,
"grad_norm": 0.6077441573143005,
"learning_rate": 4.9238842601563004e-05,
"loss": 0.0829,
"num_input_tokens_seen": 16896000,
"step": 16500
},
{
"epoch": 0.156854061136177,
"grad_norm": 0.7938207387924194,
"learning_rate": 4.921577582786651e-05,
"loss": 0.0807,
"num_input_tokens_seen": 17408000,
"step": 17000
},
{
"epoch": 0.16146741587547633,
"grad_norm": 1.7776683568954468,
"learning_rate": 4.919270905417002e-05,
"loss": 0.0801,
"num_input_tokens_seen": 17920000,
"step": 17500
},
{
"epoch": 0.16608077061477566,
"grad_norm": 0.9043099880218506,
"learning_rate": 4.916964228047351e-05,
"loss": 0.0806,
"num_input_tokens_seen": 18432000,
"step": 18000
},
{
"epoch": 0.17069412535407497,
"grad_norm": 3.0099849700927734,
"learning_rate": 4.914657550677702e-05,
"loss": 0.0801,
"num_input_tokens_seen": 18944000,
"step": 18500
},
{
"epoch": 0.1753074800933743,
"grad_norm": 1.3632686138153076,
"learning_rate": 4.912350873308053e-05,
"loss": 0.0858,
"num_input_tokens_seen": 19456000,
"step": 19000
},
{
"epoch": 0.17992083483267363,
"grad_norm": 1.3890104293823242,
"learning_rate": 4.910044195938403e-05,
"loss": 0.0807,
"num_input_tokens_seen": 19968000,
"step": 19500
},
{
"epoch": 0.18453418957197296,
"grad_norm": 1.393978476524353,
"learning_rate": 4.907737518568753e-05,
"loss": 0.09,
"num_input_tokens_seen": 20480000,
"step": 20000
},
{
"epoch": 0.18914754431127226,
"grad_norm": 0.9538819193840027,
"learning_rate": 4.9054308411991036e-05,
"loss": 0.0862,
"num_input_tokens_seen": 20992000,
"step": 20500
},
{
"epoch": 0.1937608990505716,
"grad_norm": 1.6974983215332031,
"learning_rate": 4.903124163829454e-05,
"loss": 0.0778,
"num_input_tokens_seen": 21504000,
"step": 21000
},
{
"epoch": 0.19837425378987092,
"grad_norm": 0.43043065071105957,
"learning_rate": 4.900817486459804e-05,
"loss": 0.0927,
"num_input_tokens_seen": 22016000,
"step": 21500
},
{
"epoch": 0.20298760852917025,
"grad_norm": 0.9475088119506836,
"learning_rate": 4.8985108090901545e-05,
"loss": 0.0813,
"num_input_tokens_seen": 22528000,
"step": 22000
},
{
"epoch": 0.20760096326846958,
"grad_norm": 3.547081470489502,
"learning_rate": 4.8962041317205045e-05,
"loss": 0.0849,
"num_input_tokens_seen": 23040000,
"step": 22500
},
{
"epoch": 0.21221431800776888,
"grad_norm": 1.2342774868011475,
"learning_rate": 4.893897454350855e-05,
"loss": 0.0831,
"num_input_tokens_seen": 23552000,
"step": 23000
},
{
"epoch": 0.2168276727470682,
"grad_norm": 2.133857488632202,
"learning_rate": 4.891590776981205e-05,
"loss": 0.0774,
"num_input_tokens_seen": 24064000,
"step": 23500
},
{
"epoch": 0.22144102748636754,
"grad_norm": 2.0566883087158203,
"learning_rate": 4.889284099611556e-05,
"loss": 0.0778,
"num_input_tokens_seen": 24576000,
"step": 24000
},
{
"epoch": 0.22605438222566687,
"grad_norm": 0.5913178324699402,
"learning_rate": 4.886977422241906e-05,
"loss": 0.0811,
"num_input_tokens_seen": 25088000,
"step": 24500
},
{
"epoch": 0.23066773696496617,
"grad_norm": 1.9674791097640991,
"learning_rate": 4.884670744872256e-05,
"loss": 0.0743,
"num_input_tokens_seen": 25600000,
"step": 25000
},
{
"epoch": 0.2352810917042655,
"grad_norm": 0.5584122538566589,
"learning_rate": 4.882364067502607e-05,
"loss": 0.0852,
"num_input_tokens_seen": 26112000,
"step": 25500
},
{
"epoch": 0.23989444644356483,
"grad_norm": 1.9229296445846558,
"learning_rate": 4.880057390132957e-05,
"loss": 0.0828,
"num_input_tokens_seen": 26624000,
"step": 26000
},
{
"epoch": 0.24450780118286417,
"grad_norm": 1.968058466911316,
"learning_rate": 4.877750712763308e-05,
"loss": 0.0822,
"num_input_tokens_seen": 27136000,
"step": 26500
},
{
"epoch": 0.24912115592216347,
"grad_norm": 1.6034080982208252,
"learning_rate": 4.875444035393658e-05,
"loss": 0.0822,
"num_input_tokens_seen": 27648000,
"step": 27000
},
{
"epoch": 0.2537345106614628,
"grad_norm": 1.7301759719848633,
"learning_rate": 4.873137358024008e-05,
"loss": 0.0833,
"num_input_tokens_seen": 28160000,
"step": 27500
},
{
"epoch": 0.2583478654007621,
"grad_norm": 2.2902233600616455,
"learning_rate": 4.8708306806543585e-05,
"loss": 0.0904,
"num_input_tokens_seen": 28672000,
"step": 28000
},
{
"epoch": 0.26296122014006146,
"grad_norm": 2.805758476257324,
"learning_rate": 4.868524003284709e-05,
"loss": 0.0854,
"num_input_tokens_seen": 29184000,
"step": 28500
},
{
"epoch": 0.26757457487936076,
"grad_norm": 1.0350342988967896,
"learning_rate": 4.8662173259150587e-05,
"loss": 0.0806,
"num_input_tokens_seen": 29696000,
"step": 29000
},
{
"epoch": 0.2721879296186601,
"grad_norm": 0.6509085893630981,
"learning_rate": 4.8639106485454094e-05,
"loss": 0.0846,
"num_input_tokens_seen": 30208000,
"step": 29500
},
{
"epoch": 0.2768012843579594,
"grad_norm": 1.2850301265716553,
"learning_rate": 4.86160397117576e-05,
"loss": 0.0857,
"num_input_tokens_seen": 30720000,
"step": 30000
},
{
"epoch": 0.2814146390972587,
"grad_norm": 1.7259219884872437,
"learning_rate": 4.85929729380611e-05,
"loss": 0.0839,
"num_input_tokens_seen": 31232000,
"step": 30500
},
{
"epoch": 0.2860279938365581,
"grad_norm": 1.7700318098068237,
"learning_rate": 4.85699061643646e-05,
"loss": 0.0768,
"num_input_tokens_seen": 31744000,
"step": 31000
},
{
"epoch": 0.2906413485758574,
"grad_norm": 1.1451270580291748,
"learning_rate": 4.854683939066811e-05,
"loss": 0.0824,
"num_input_tokens_seen": 32256000,
"step": 31500
},
{
"epoch": 0.29525470331515674,
"grad_norm": 1.772096872329712,
"learning_rate": 4.852377261697161e-05,
"loss": 0.0847,
"num_input_tokens_seen": 32768000,
"step": 32000
},
{
"epoch": 0.29986805805445604,
"grad_norm": 1.671513557434082,
"learning_rate": 4.850070584327511e-05,
"loss": 0.0838,
"num_input_tokens_seen": 33280000,
"step": 32500
},
{
"epoch": 0.30448141279375535,
"grad_norm": 0.9703548550605774,
"learning_rate": 4.847763906957862e-05,
"loss": 0.08,
"num_input_tokens_seen": 33792000,
"step": 33000
},
{
"epoch": 0.3090947675330547,
"grad_norm": 0.7928164601325989,
"learning_rate": 4.8454572295882126e-05,
"loss": 0.08,
"num_input_tokens_seen": 34304000,
"step": 33500
},
{
"epoch": 0.313708122272354,
"grad_norm": 1.1138111352920532,
"learning_rate": 4.8431505522185626e-05,
"loss": 0.0733,
"num_input_tokens_seen": 34816000,
"step": 34000
},
{
"epoch": 0.3183214770116533,
"grad_norm": 0.89890056848526,
"learning_rate": 4.840843874848913e-05,
"loss": 0.0828,
"num_input_tokens_seen": 35328000,
"step": 34500
},
{
"epoch": 0.32293483175095267,
"grad_norm": 2.127382516860962,
"learning_rate": 4.8385371974792634e-05,
"loss": 0.0818,
"num_input_tokens_seen": 35840000,
"step": 35000
},
{
"epoch": 0.32754818649025197,
"grad_norm": 1.0730081796646118,
"learning_rate": 4.8362305201096135e-05,
"loss": 0.0776,
"num_input_tokens_seen": 36352000,
"step": 35500
},
{
"epoch": 0.3321615412295513,
"grad_norm": 0.5055031180381775,
"learning_rate": 4.833923842739964e-05,
"loss": 0.085,
"num_input_tokens_seen": 36864000,
"step": 36000
},
{
"epoch": 0.33677489596885063,
"grad_norm": 2.764418601989746,
"learning_rate": 4.831617165370314e-05,
"loss": 0.0795,
"num_input_tokens_seen": 37376000,
"step": 36500
},
{
"epoch": 0.34138825070814993,
"grad_norm": 2.272135019302368,
"learning_rate": 4.829310488000664e-05,
"loss": 0.0757,
"num_input_tokens_seen": 37888000,
"step": 37000
},
{
"epoch": 0.3460016054474493,
"grad_norm": 2.2221481800079346,
"learning_rate": 4.827003810631015e-05,
"loss": 0.0881,
"num_input_tokens_seen": 38400000,
"step": 37500
},
{
"epoch": 0.3506149601867486,
"grad_norm": 1.7147547006607056,
"learning_rate": 4.824697133261365e-05,
"loss": 0.0805,
"num_input_tokens_seen": 38912000,
"step": 38000
},
{
"epoch": 0.35522831492604795,
"grad_norm": 2.031804084777832,
"learning_rate": 4.822390455891715e-05,
"loss": 0.0762,
"num_input_tokens_seen": 39424000,
"step": 38500
},
{
"epoch": 0.35984166966534725,
"grad_norm": 0.8008927702903748,
"learning_rate": 4.820083778522066e-05,
"loss": 0.0794,
"num_input_tokens_seen": 39936000,
"step": 39000
},
{
"epoch": 0.36445502440464655,
"grad_norm": 1.5696818828582764,
"learning_rate": 4.8177771011524167e-05,
"loss": 0.0821,
"num_input_tokens_seen": 40448000,
"step": 39500
},
{
"epoch": 0.3690683791439459,
"grad_norm": 0.7710667252540588,
"learning_rate": 4.815470423782766e-05,
"loss": 0.0776,
"num_input_tokens_seen": 40960000,
"step": 40000
},
{
"epoch": 0.3736817338832452,
"grad_norm": 1.0794172286987305,
"learning_rate": 4.813163746413117e-05,
"loss": 0.0781,
"num_input_tokens_seen": 41472000,
"step": 40500
},
{
"epoch": 0.3782950886225445,
"grad_norm": 2.43756365776062,
"learning_rate": 4.8108570690434675e-05,
"loss": 0.0787,
"num_input_tokens_seen": 41984000,
"step": 41000
},
{
"epoch": 0.3829084433618439,
"grad_norm": 0.6750785112380981,
"learning_rate": 4.8085503916738176e-05,
"loss": 0.081,
"num_input_tokens_seen": 42496000,
"step": 41500
},
{
"epoch": 0.3875217981011432,
"grad_norm": 0.7780609726905823,
"learning_rate": 4.8062437143041676e-05,
"loss": 0.0791,
"num_input_tokens_seen": 43008000,
"step": 42000
},
{
"epoch": 0.39213515284044254,
"grad_norm": 1.1585677862167358,
"learning_rate": 4.8039370369345184e-05,
"loss": 0.0811,
"num_input_tokens_seen": 43520000,
"step": 42500
},
{
"epoch": 0.39674850757974184,
"grad_norm": 2.7044448852539062,
"learning_rate": 4.8016303595648684e-05,
"loss": 0.0775,
"num_input_tokens_seen": 44032000,
"step": 43000
},
{
"epoch": 0.40136186231904114,
"grad_norm": 2.9311044216156006,
"learning_rate": 4.799323682195219e-05,
"loss": 0.0739,
"num_input_tokens_seen": 44544000,
"step": 43500
},
{
"epoch": 0.4059752170583405,
"grad_norm": 2.255924940109253,
"learning_rate": 4.797017004825569e-05,
"loss": 0.0814,
"num_input_tokens_seen": 45056000,
"step": 44000
},
{
"epoch": 0.4105885717976398,
"grad_norm": 3.5307369232177734,
"learning_rate": 4.79471032745592e-05,
"loss": 0.0773,
"num_input_tokens_seen": 45568000,
"step": 44500
},
{
"epoch": 0.41520192653693916,
"grad_norm": 0.7721351385116577,
"learning_rate": 4.79240365008627e-05,
"loss": 0.074,
"num_input_tokens_seen": 46080000,
"step": 45000
},
{
"epoch": 0.41981528127623846,
"grad_norm": 1.668393611907959,
"learning_rate": 4.79009697271662e-05,
"loss": 0.0763,
"num_input_tokens_seen": 46592000,
"step": 45500
},
{
"epoch": 0.42442863601553776,
"grad_norm": 2.3824353218078613,
"learning_rate": 4.787790295346971e-05,
"loss": 0.0772,
"num_input_tokens_seen": 47104000,
"step": 46000
},
{
"epoch": 0.4290419907548371,
"grad_norm": 2.127598762512207,
"learning_rate": 4.785483617977321e-05,
"loss": 0.0803,
"num_input_tokens_seen": 47616000,
"step": 46500
},
{
"epoch": 0.4336553454941364,
"grad_norm": 2.958203077316284,
"learning_rate": 4.7831769406076716e-05,
"loss": 0.0781,
"num_input_tokens_seen": 48128000,
"step": 47000
},
{
"epoch": 0.4382687002334357,
"grad_norm": 0.7533183693885803,
"learning_rate": 4.7808702632380217e-05,
"loss": 0.0793,
"num_input_tokens_seen": 48640000,
"step": 47500
},
{
"epoch": 0.4428820549727351,
"grad_norm": 1.3638031482696533,
"learning_rate": 4.778563585868372e-05,
"loss": 0.081,
"num_input_tokens_seen": 49152000,
"step": 48000
},
{
"epoch": 0.4474954097120344,
"grad_norm": 1.3746527433395386,
"learning_rate": 4.7762569084987225e-05,
"loss": 0.0863,
"num_input_tokens_seen": 49664000,
"step": 48500
},
{
"epoch": 0.45210876445133374,
"grad_norm": 1.5628637075424194,
"learning_rate": 4.773950231129073e-05,
"loss": 0.0799,
"num_input_tokens_seen": 50176000,
"step": 49000
},
{
"epoch": 0.45672211919063305,
"grad_norm": 1.8787376880645752,
"learning_rate": 4.7716435537594226e-05,
"loss": 0.0782,
"num_input_tokens_seen": 50688000,
"step": 49500
},
{
"epoch": 0.46133547392993235,
"grad_norm": 1.3804419040679932,
"learning_rate": 4.769336876389773e-05,
"loss": 0.0833,
"num_input_tokens_seen": 51200000,
"step": 50000
},
{
"epoch": 0.4659488286692317,
"grad_norm": 1.6135491132736206,
"learning_rate": 4.767030199020124e-05,
"loss": 0.0762,
"num_input_tokens_seen": 51712000,
"step": 50500
},
{
"epoch": 0.470562183408531,
"grad_norm": 2.186791181564331,
"learning_rate": 4.7647235216504734e-05,
"loss": 0.0797,
"num_input_tokens_seen": 52224000,
"step": 51000
},
{
"epoch": 0.4751755381478303,
"grad_norm": 1.6921688318252563,
"learning_rate": 4.762416844280824e-05,
"loss": 0.0812,
"num_input_tokens_seen": 52736000,
"step": 51500
},
{
"epoch": 0.47978889288712967,
"grad_norm": 0.95241379737854,
"learning_rate": 4.760110166911175e-05,
"loss": 0.0788,
"num_input_tokens_seen": 53248000,
"step": 52000
},
{
"epoch": 0.484402247626429,
"grad_norm": 3.2142257690429688,
"learning_rate": 4.757803489541525e-05,
"loss": 0.0776,
"num_input_tokens_seen": 53760000,
"step": 52500
},
{
"epoch": 0.48901560236572833,
"grad_norm": 3.2678260803222656,
"learning_rate": 4.755496812171875e-05,
"loss": 0.0753,
"num_input_tokens_seen": 54272000,
"step": 53000
},
{
"epoch": 0.49362895710502763,
"grad_norm": 2.8343145847320557,
"learning_rate": 4.753190134802226e-05,
"loss": 0.0784,
"num_input_tokens_seen": 54784000,
"step": 53500
},
{
"epoch": 0.49824231184432693,
"grad_norm": 1.4818017482757568,
"learning_rate": 4.750883457432576e-05,
"loss": 0.0752,
"num_input_tokens_seen": 55296000,
"step": 54000
},
{
"epoch": 0.5028556665836262,
"grad_norm": 1.2139348983764648,
"learning_rate": 4.7485767800629265e-05,
"loss": 0.0734,
"num_input_tokens_seen": 55808000,
"step": 54500
},
{
"epoch": 0.5074690213229256,
"grad_norm": 1.3937476873397827,
"learning_rate": 4.7462701026932766e-05,
"loss": 0.0759,
"num_input_tokens_seen": 56320000,
"step": 55000
},
{
"epoch": 0.512082376062225,
"grad_norm": 1.7801790237426758,
"learning_rate": 4.743963425323627e-05,
"loss": 0.0799,
"num_input_tokens_seen": 56832000,
"step": 55500
},
{
"epoch": 0.5166957308015242,
"grad_norm": 0.9710603952407837,
"learning_rate": 4.7416567479539774e-05,
"loss": 0.0705,
"num_input_tokens_seen": 57344000,
"step": 56000
},
{
"epoch": 0.5213090855408236,
"grad_norm": 1.3923077583312988,
"learning_rate": 4.739350070584328e-05,
"loss": 0.0778,
"num_input_tokens_seen": 57856000,
"step": 56500
},
{
"epoch": 0.5259224402801229,
"grad_norm": 0.5901740193367004,
"learning_rate": 4.737043393214678e-05,
"loss": 0.0729,
"num_input_tokens_seen": 58368000,
"step": 57000
},
{
"epoch": 0.5305357950194223,
"grad_norm": 1.3465195894241333,
"learning_rate": 4.734736715845028e-05,
"loss": 0.0797,
"num_input_tokens_seen": 58880000,
"step": 57500
},
{
"epoch": 0.5351491497587215,
"grad_norm": 0.48033392429351807,
"learning_rate": 4.732430038475379e-05,
"loss": 0.0736,
"num_input_tokens_seen": 59392000,
"step": 58000
},
{
"epoch": 0.5397625044980209,
"grad_norm": 1.3446660041809082,
"learning_rate": 4.730123361105729e-05,
"loss": 0.0778,
"num_input_tokens_seen": 59904000,
"step": 58500
},
{
"epoch": 0.5443758592373202,
"grad_norm": 0.895521342754364,
"learning_rate": 4.727816683736079e-05,
"loss": 0.0754,
"num_input_tokens_seen": 60416000,
"step": 59000
},
{
"epoch": 0.5489892139766195,
"grad_norm": 1.3843989372253418,
"learning_rate": 4.72551000636643e-05,
"loss": 0.0817,
"num_input_tokens_seen": 60928000,
"step": 59500
},
{
"epoch": 0.5536025687159188,
"grad_norm": 1.5670028924942017,
"learning_rate": 4.7232033289967806e-05,
"loss": 0.0742,
"num_input_tokens_seen": 61440000,
"step": 60000
},
{
"epoch": 0.5582159234552182,
"grad_norm": 1.4761849641799927,
"learning_rate": 4.72089665162713e-05,
"loss": 0.0688,
"num_input_tokens_seen": 61952000,
"step": 60500
},
{
"epoch": 0.5628292781945174,
"grad_norm": 6.005481719970703,
"learning_rate": 4.718589974257481e-05,
"loss": 0.0836,
"num_input_tokens_seen": 62464000,
"step": 61000
},
{
"epoch": 0.5674426329338168,
"grad_norm": 1.2835499048233032,
"learning_rate": 4.7162832968878314e-05,
"loss": 0.0731,
"num_input_tokens_seen": 62976000,
"step": 61500
},
{
"epoch": 0.5720559876731162,
"grad_norm": 1.769403338432312,
"learning_rate": 4.7139766195181815e-05,
"loss": 0.079,
"num_input_tokens_seen": 63488000,
"step": 62000
},
{
"epoch": 0.5766693424124154,
"grad_norm": 1.8391185998916626,
"learning_rate": 4.7116699421485315e-05,
"loss": 0.082,
"num_input_tokens_seen": 64000000,
"step": 62500
},
{
"epoch": 0.5812826971517148,
"grad_norm": 1.3075145483016968,
"learning_rate": 4.709363264778882e-05,
"loss": 0.0753,
"num_input_tokens_seen": 64512000,
"step": 63000
},
{
"epoch": 0.5858960518910141,
"grad_norm": 2.2406928539276123,
"learning_rate": 4.707056587409232e-05,
"loss": 0.0737,
"num_input_tokens_seen": 65024000,
"step": 63500
},
{
"epoch": 0.5905094066303135,
"grad_norm": 2.2750511169433594,
"learning_rate": 4.7047499100395824e-05,
"loss": 0.077,
"num_input_tokens_seen": 65536000,
"step": 64000
},
{
"epoch": 0.5951227613696127,
"grad_norm": 1.7060987949371338,
"learning_rate": 4.702443232669933e-05,
"loss": 0.0764,
"num_input_tokens_seen": 66048000,
"step": 64500
},
{
"epoch": 0.5997361161089121,
"grad_norm": 1.3420023918151855,
"learning_rate": 4.700136555300283e-05,
"loss": 0.0803,
"num_input_tokens_seen": 66560000,
"step": 65000
},
{
"epoch": 0.6043494708482114,
"grad_norm": 0.8915556073188782,
"learning_rate": 4.697829877930634e-05,
"loss": 0.0765,
"num_input_tokens_seen": 67072000,
"step": 65500
},
{
"epoch": 0.6089628255875107,
"grad_norm": 2.3567070960998535,
"learning_rate": 4.695523200560984e-05,
"loss": 0.0739,
"num_input_tokens_seen": 67584000,
"step": 66000
},
{
"epoch": 0.61357618032681,
"grad_norm": 1.8976528644561768,
"learning_rate": 4.693216523191335e-05,
"loss": 0.0738,
"num_input_tokens_seen": 68096000,
"step": 66500
},
{
"epoch": 0.6181895350661094,
"grad_norm": 2.0413930416107178,
"learning_rate": 4.690909845821685e-05,
"loss": 0.0826,
"num_input_tokens_seen": 68608000,
"step": 67000
},
{
"epoch": 0.6228028898054087,
"grad_norm": 4.672994613647461,
"learning_rate": 4.6886031684520355e-05,
"loss": 0.0773,
"num_input_tokens_seen": 69120000,
"step": 67500
},
{
"epoch": 0.627416244544708,
"grad_norm": 1.1743087768554688,
"learning_rate": 4.6862964910823856e-05,
"loss": 0.0745,
"num_input_tokens_seen": 69632000,
"step": 68000
},
{
"epoch": 0.6320295992840074,
"grad_norm": 0.7749766707420349,
"learning_rate": 4.6839898137127356e-05,
"loss": 0.0738,
"num_input_tokens_seen": 70144000,
"step": 68500
},
{
"epoch": 0.6366429540233066,
"grad_norm": 0.5075979232788086,
"learning_rate": 4.6816831363430864e-05,
"loss": 0.0747,
"num_input_tokens_seen": 70656000,
"step": 69000
},
{
"epoch": 0.641256308762606,
"grad_norm": 2.802272081375122,
"learning_rate": 4.679376458973437e-05,
"loss": 0.0825,
"num_input_tokens_seen": 71168000,
"step": 69500
},
{
"epoch": 0.6458696635019053,
"grad_norm": 1.798438549041748,
"learning_rate": 4.6770697816037865e-05,
"loss": 0.0766,
"num_input_tokens_seen": 71680000,
"step": 70000
},
{
"epoch": 0.6504830182412047,
"grad_norm": 1.7648403644561768,
"learning_rate": 4.674763104234137e-05,
"loss": 0.077,
"num_input_tokens_seen": 72192000,
"step": 70500
},
{
"epoch": 0.6550963729805039,
"grad_norm": 2.0195560455322266,
"learning_rate": 4.672456426864488e-05,
"loss": 0.0767,
"num_input_tokens_seen": 72704000,
"step": 71000
},
{
"epoch": 0.6597097277198033,
"grad_norm": 3.9862349033355713,
"learning_rate": 4.670149749494837e-05,
"loss": 0.0745,
"num_input_tokens_seen": 73216000,
"step": 71500
},
{
"epoch": 0.6643230824591027,
"grad_norm": 2.7226781845092773,
"learning_rate": 4.667843072125188e-05,
"loss": 0.0703,
"num_input_tokens_seen": 73728000,
"step": 72000
},
{
"epoch": 0.6689364371984019,
"grad_norm": 2.0484044551849365,
"learning_rate": 4.665536394755539e-05,
"loss": 0.0765,
"num_input_tokens_seen": 74240000,
"step": 72500
},
{
"epoch": 0.6735497919377013,
"grad_norm": 0.4825538694858551,
"learning_rate": 4.663229717385889e-05,
"loss": 0.0823,
"num_input_tokens_seen": 74752000,
"step": 73000
},
{
"epoch": 0.6781631466770006,
"grad_norm": 1.2127926349639893,
"learning_rate": 4.660923040016239e-05,
"loss": 0.0754,
"num_input_tokens_seen": 75264000,
"step": 73500
},
{
"epoch": 0.6827765014162999,
"grad_norm": 3.139049768447876,
"learning_rate": 4.6586163626465897e-05,
"loss": 0.0749,
"num_input_tokens_seen": 75776000,
"step": 74000
},
{
"epoch": 0.6873898561555992,
"grad_norm": 2.038872480392456,
"learning_rate": 4.65630968527694e-05,
"loss": 0.0753,
"num_input_tokens_seen": 76288000,
"step": 74500
},
{
"epoch": 0.6920032108948986,
"grad_norm": 4.1413469314575195,
"learning_rate": 4.6540030079072904e-05,
"loss": 0.0761,
"num_input_tokens_seen": 76800000,
"step": 75000
},
{
"epoch": 0.6966165656341978,
"grad_norm": 1.3078006505966187,
"learning_rate": 4.6516963305376405e-05,
"loss": 0.0766,
"num_input_tokens_seen": 77312000,
"step": 75500
},
{
"epoch": 0.7012299203734972,
"grad_norm": 1.2052334547042847,
"learning_rate": 4.649389653167991e-05,
"loss": 0.0749,
"num_input_tokens_seen": 77824000,
"step": 76000
},
{
"epoch": 0.7058432751127965,
"grad_norm": 1.5266985893249512,
"learning_rate": 4.647082975798341e-05,
"loss": 0.0768,
"num_input_tokens_seen": 78336000,
"step": 76500
},
{
"epoch": 0.7104566298520959,
"grad_norm": 13.878520011901855,
"learning_rate": 4.6447762984286914e-05,
"loss": 0.0813,
"num_input_tokens_seen": 78848000,
"step": 77000
},
{
"epoch": 0.7150699845913951,
"grad_norm": 0.8548376560211182,
"learning_rate": 4.642469621059042e-05,
"loss": 0.0693,
"num_input_tokens_seen": 79360000,
"step": 77500
},
{
"epoch": 0.7196833393306945,
"grad_norm": 1.8979346752166748,
"learning_rate": 4.640162943689392e-05,
"loss": 0.0795,
"num_input_tokens_seen": 79872000,
"step": 78000
},
{
"epoch": 0.7242966940699939,
"grad_norm": 0.6193153262138367,
"learning_rate": 4.637856266319743e-05,
"loss": 0.0776,
"num_input_tokens_seen": 80384000,
"step": 78500
},
{
"epoch": 0.7289100488092931,
"grad_norm": 1.736380934715271,
"learning_rate": 4.635549588950093e-05,
"loss": 0.079,
"num_input_tokens_seen": 80896000,
"step": 79000
},
{
"epoch": 0.7335234035485925,
"grad_norm": 3.559295415878296,
"learning_rate": 4.633242911580443e-05,
"loss": 0.0792,
"num_input_tokens_seen": 81408000,
"step": 79500
},
{
"epoch": 0.7381367582878918,
"grad_norm": 1.017986536026001,
"learning_rate": 4.630936234210794e-05,
"loss": 0.0782,
"num_input_tokens_seen": 81920000,
"step": 80000
},
{
"epoch": 0.7427501130271911,
"grad_norm": 1.2457808256149292,
"learning_rate": 4.6286295568411445e-05,
"loss": 0.0766,
"num_input_tokens_seen": 82432000,
"step": 80500
},
{
"epoch": 0.7473634677664904,
"grad_norm": 0.6746057271957397,
"learning_rate": 4.626322879471494e-05,
"loss": 0.0728,
"num_input_tokens_seen": 82944000,
"step": 81000
},
{
"epoch": 0.7519768225057898,
"grad_norm": 1.1048623323440552,
"learning_rate": 4.6240162021018446e-05,
"loss": 0.0763,
"num_input_tokens_seen": 83456000,
"step": 81500
},
{
"epoch": 0.756590177245089,
"grad_norm": 2.0804615020751953,
"learning_rate": 4.621709524732195e-05,
"loss": 0.0736,
"num_input_tokens_seen": 83968000,
"step": 82000
},
{
"epoch": 0.7612035319843884,
"grad_norm": 0.7726876735687256,
"learning_rate": 4.6194028473625454e-05,
"loss": 0.0756,
"num_input_tokens_seen": 84480000,
"step": 82500
},
{
"epoch": 0.7658168867236878,
"grad_norm": 1.618414044380188,
"learning_rate": 4.6170961699928954e-05,
"loss": 0.0736,
"num_input_tokens_seen": 84992000,
"step": 83000
},
{
"epoch": 0.7704302414629871,
"grad_norm": 0.2806508243083954,
"learning_rate": 4.614789492623246e-05,
"loss": 0.0757,
"num_input_tokens_seen": 85504000,
"step": 83500
},
{
"epoch": 0.7750435962022864,
"grad_norm": 1.093205451965332,
"learning_rate": 4.612482815253596e-05,
"loss": 0.0746,
"num_input_tokens_seen": 86016000,
"step": 84000
},
{
"epoch": 0.7796569509415857,
"grad_norm": 0.8395510911941528,
"learning_rate": 4.610176137883946e-05,
"loss": 0.0728,
"num_input_tokens_seen": 86528000,
"step": 84500
},
{
"epoch": 0.7842703056808851,
"grad_norm": 5.429121017456055,
"learning_rate": 4.607869460514297e-05,
"loss": 0.0752,
"num_input_tokens_seen": 87040000,
"step": 85000
},
{
"epoch": 0.7888836604201843,
"grad_norm": 1.0684977769851685,
"learning_rate": 4.605562783144647e-05,
"loss": 0.0734,
"num_input_tokens_seen": 87552000,
"step": 85500
},
{
"epoch": 0.7934970151594837,
"grad_norm": 4.412910461425781,
"learning_rate": 4.603256105774998e-05,
"loss": 0.0724,
"num_input_tokens_seen": 88064000,
"step": 86000
},
{
"epoch": 0.798110369898783,
"grad_norm": 1.352186918258667,
"learning_rate": 4.600949428405348e-05,
"loss": 0.0752,
"num_input_tokens_seen": 88576000,
"step": 86500
},
{
"epoch": 0.8027237246380823,
"grad_norm": 3.716979742050171,
"learning_rate": 4.5986427510356986e-05,
"loss": 0.0712,
"num_input_tokens_seen": 89088000,
"step": 87000
},
{
"epoch": 0.8073370793773816,
"grad_norm": 1.6584104299545288,
"learning_rate": 4.596336073666049e-05,
"loss": 0.0733,
"num_input_tokens_seen": 89600000,
"step": 87500
},
{
"epoch": 0.811950434116681,
"grad_norm": 2.3811452388763428,
"learning_rate": 4.5940293962963994e-05,
"loss": 0.0763,
"num_input_tokens_seen": 90112000,
"step": 88000
},
{
"epoch": 0.8165637888559802,
"grad_norm": 1.4352256059646606,
"learning_rate": 4.5917227189267495e-05,
"loss": 0.0696,
"num_input_tokens_seen": 90624000,
"step": 88500
},
{
"epoch": 0.8211771435952796,
"grad_norm": 2.95996356010437,
"learning_rate": 4.5894160415570995e-05,
"loss": 0.0675,
"num_input_tokens_seen": 91136000,
"step": 89000
},
{
"epoch": 0.825790498334579,
"grad_norm": 1.790480375289917,
"learning_rate": 4.58710936418745e-05,
"loss": 0.0737,
"num_input_tokens_seen": 91648000,
"step": 89500
},
{
"epoch": 0.8304038530738783,
"grad_norm": 2.4636244773864746,
"learning_rate": 4.5848026868178e-05,
"loss": 0.0725,
"num_input_tokens_seen": 92160000,
"step": 90000
},
{
"epoch": 0.8350172078131776,
"grad_norm": 1.4085214138031006,
"learning_rate": 4.5824960094481504e-05,
"loss": 0.0801,
"num_input_tokens_seen": 92672000,
"step": 90500
},
{
"epoch": 0.8396305625524769,
"grad_norm": 1.5080194473266602,
"learning_rate": 4.580189332078501e-05,
"loss": 0.0707,
"num_input_tokens_seen": 93184000,
"step": 91000
},
{
"epoch": 0.8442439172917763,
"grad_norm": 0.8035141229629517,
"learning_rate": 4.577882654708852e-05,
"loss": 0.0775,
"num_input_tokens_seen": 93696000,
"step": 91500
},
{
"epoch": 0.8488572720310755,
"grad_norm": 1.832581639289856,
"learning_rate": 4.575575977339201e-05,
"loss": 0.076,
"num_input_tokens_seen": 94208000,
"step": 92000
},
{
"epoch": 0.8534706267703749,
"grad_norm": 0.5887289047241211,
"learning_rate": 4.573269299969552e-05,
"loss": 0.0752,
"num_input_tokens_seen": 94720000,
"step": 92500
},
{
"epoch": 0.8580839815096742,
"grad_norm": 0.7849867939949036,
"learning_rate": 4.570962622599903e-05,
"loss": 0.0815,
"num_input_tokens_seen": 95232000,
"step": 93000
},
{
"epoch": 0.8626973362489735,
"grad_norm": 2.76053524017334,
"learning_rate": 4.568655945230253e-05,
"loss": 0.0696,
"num_input_tokens_seen": 95744000,
"step": 93500
},
{
"epoch": 0.8673106909882728,
"grad_norm": 0.608044445514679,
"learning_rate": 4.566349267860603e-05,
"loss": 0.0764,
"num_input_tokens_seen": 96256000,
"step": 94000
},
{
"epoch": 0.8719240457275722,
"grad_norm": 2.4751555919647217,
"learning_rate": 4.5640425904909536e-05,
"loss": 0.0706,
"num_input_tokens_seen": 96768000,
"step": 94500
},
{
"epoch": 0.8765374004668715,
"grad_norm": 0.5605325698852539,
"learning_rate": 4.5617359131213036e-05,
"loss": 0.074,
"num_input_tokens_seen": 97280000,
"step": 95000
},
{
"epoch": 0.8811507552061708,
"grad_norm": 2.0805656909942627,
"learning_rate": 4.5594292357516544e-05,
"loss": 0.0723,
"num_input_tokens_seen": 97792000,
"step": 95500
},
{
"epoch": 0.8857641099454702,
"grad_norm": 0.8538010120391846,
"learning_rate": 4.5571225583820044e-05,
"loss": 0.0755,
"num_input_tokens_seen": 98304000,
"step": 96000
},
{
"epoch": 0.8903774646847694,
"grad_norm": 0.7344834804534912,
"learning_rate": 4.5548158810123545e-05,
"loss": 0.0722,
"num_input_tokens_seen": 98816000,
"step": 96500
},
{
"epoch": 0.8949908194240688,
"grad_norm": 0.9666327238082886,
"learning_rate": 4.552509203642705e-05,
"loss": 0.0777,
"num_input_tokens_seen": 99328000,
"step": 97000
},
{
"epoch": 0.8996041741633681,
"grad_norm": 1.5512099266052246,
"learning_rate": 4.550202526273055e-05,
"loss": 0.0751,
"num_input_tokens_seen": 99840000,
"step": 97500
},
{
"epoch": 0.9042175289026675,
"grad_norm": 0.9923927187919617,
"learning_rate": 4.547895848903406e-05,
"loss": 0.073,
"num_input_tokens_seen": 100352000,
"step": 98000
},
{
"epoch": 0.9088308836419667,
"grad_norm": 1.5789976119995117,
"learning_rate": 4.545589171533756e-05,
"loss": 0.068,
"num_input_tokens_seen": 100864000,
"step": 98500
},
{
"epoch": 0.9134442383812661,
"grad_norm": 0.3622562885284424,
"learning_rate": 4.543282494164107e-05,
"loss": 0.0711,
"num_input_tokens_seen": 101376000,
"step": 99000
},
{
"epoch": 0.9180575931205655,
"grad_norm": 1.9762753248214722,
"learning_rate": 4.540975816794457e-05,
"loss": 0.0678,
"num_input_tokens_seen": 101888000,
"step": 99500
},
{
"epoch": 0.9226709478598647,
"grad_norm": 2.144947052001953,
"learning_rate": 4.538669139424807e-05,
"loss": 0.0705,
"num_input_tokens_seen": 102400000,
"step": 100000
},
{
"epoch": 0.9272843025991641,
"grad_norm": 0.5793939232826233,
"learning_rate": 4.5363624620551576e-05,
"loss": 0.0798,
"num_input_tokens_seen": 102912000,
"step": 100500
},
{
"epoch": 0.9318976573384634,
"grad_norm": 1.8652976751327515,
"learning_rate": 4.5340557846855084e-05,
"loss": 0.0723,
"num_input_tokens_seen": 103424000,
"step": 101000
},
{
"epoch": 0.9365110120777627,
"grad_norm": 1.8371716737747192,
"learning_rate": 4.531749107315858e-05,
"loss": 0.0752,
"num_input_tokens_seen": 103936000,
"step": 101500
},
{
"epoch": 0.941124366817062,
"grad_norm": 1.0695359706878662,
"learning_rate": 4.5294424299462085e-05,
"loss": 0.0786,
"num_input_tokens_seen": 104448000,
"step": 102000
},
{
"epoch": 0.9457377215563614,
"grad_norm": 1.6259958744049072,
"learning_rate": 4.527135752576559e-05,
"loss": 0.0726,
"num_input_tokens_seen": 104960000,
"step": 102500
},
{
"epoch": 0.9503510762956606,
"grad_norm": 2.0838193893432617,
"learning_rate": 4.5248290752069086e-05,
"loss": 0.0729,
"num_input_tokens_seen": 105472000,
"step": 103000
},
{
"epoch": 0.95496443103496,
"grad_norm": 1.8072469234466553,
"learning_rate": 4.5225223978372593e-05,
"loss": 0.0725,
"num_input_tokens_seen": 105984000,
"step": 103500
},
{
"epoch": 0.9595777857742593,
"grad_norm": 1.4469674825668335,
"learning_rate": 4.52021572046761e-05,
"loss": 0.0762,
"num_input_tokens_seen": 106496000,
"step": 104000
},
{
"epoch": 0.9641911405135587,
"grad_norm": 0.8151160478591919,
"learning_rate": 4.51790904309796e-05,
"loss": 0.0713,
"num_input_tokens_seen": 107008000,
"step": 104500
},
{
"epoch": 0.968804495252858,
"grad_norm": 2.5363306999206543,
"learning_rate": 4.51560236572831e-05,
"loss": 0.0717,
"num_input_tokens_seen": 107520000,
"step": 105000
},
{
"epoch": 0.9734178499921573,
"grad_norm": 2.3089513778686523,
"learning_rate": 4.513295688358661e-05,
"loss": 0.075,
"num_input_tokens_seen": 108032000,
"step": 105500
},
{
"epoch": 0.9780312047314567,
"grad_norm": 1.2738145589828491,
"learning_rate": 4.510989010989011e-05,
"loss": 0.0739,
"num_input_tokens_seen": 108544000,
"step": 106000
},
{
"epoch": 0.9826445594707559,
"grad_norm": 0.9310311675071716,
"learning_rate": 4.508682333619362e-05,
"loss": 0.0715,
"num_input_tokens_seen": 109056000,
"step": 106500
},
{
"epoch": 0.9872579142100553,
"grad_norm": 1.332413911819458,
"learning_rate": 4.506375656249712e-05,
"loss": 0.0762,
"num_input_tokens_seen": 109568000,
"step": 107000
},
{
"epoch": 0.9918712689493546,
"grad_norm": 1.171770691871643,
"learning_rate": 4.504068978880062e-05,
"loss": 0.0682,
"num_input_tokens_seen": 110080000,
"step": 107500
},
{
"epoch": 0.9964846236886539,
"grad_norm": 1.318642497062683,
"learning_rate": 4.5017623015104126e-05,
"loss": 0.0725,
"num_input_tokens_seen": 110592000,
"step": 108000
},
{
"epoch": 1.0,
"eval_combined_score": 0.07267016709729579,
"eval_loss": 0.07267016172409058,
"eval_mse": 0.07267016501992041,
"eval_runtime": 46.4186,
"eval_samples_per_second": 2075.42,
"eval_steps_per_second": 259.444,
"num_input_tokens_seen": 110981376,
"step": 108381
},
{
"epoch": 1.0010979784279532,
"grad_norm": 2.0301551818847656,
"learning_rate": 4.499455624140763e-05,
"loss": 0.0723,
"num_input_tokens_seen": 111103232,
"step": 108500
},
{
"epoch": 1.0057113331672525,
"grad_norm": 0.46064960956573486,
"learning_rate": 4.4971489467711134e-05,
"loss": 0.066,
"num_input_tokens_seen": 111615232,
"step": 109000
},
{
"epoch": 1.010324687906552,
"grad_norm": 2.481804132461548,
"learning_rate": 4.4948422694014634e-05,
"loss": 0.0567,
"num_input_tokens_seen": 112127232,
"step": 109500
},
{
"epoch": 1.0149380426458512,
"grad_norm": 1.0883979797363281,
"learning_rate": 4.492535592031814e-05,
"loss": 0.0591,
"num_input_tokens_seen": 112639232,
"step": 110000
},
{
"epoch": 1.0195513973851504,
"grad_norm": 1.5821534395217896,
"learning_rate": 4.490228914662164e-05,
"loss": 0.0575,
"num_input_tokens_seen": 113151232,
"step": 110500
},
{
"epoch": 1.02416475212445,
"grad_norm": 1.1834355592727661,
"learning_rate": 4.487922237292514e-05,
"loss": 0.0643,
"num_input_tokens_seen": 113663232,
"step": 111000
},
{
"epoch": 1.0287781068637492,
"grad_norm": 0.5016165375709534,
"learning_rate": 4.485615559922865e-05,
"loss": 0.0598,
"num_input_tokens_seen": 114175232,
"step": 111500
},
{
"epoch": 1.0333914616030484,
"grad_norm": 2.372044086456299,
"learning_rate": 4.483308882553216e-05,
"loss": 0.0608,
"num_input_tokens_seen": 114687232,
"step": 112000
},
{
"epoch": 1.0380048163423479,
"grad_norm": 1.4434441328048706,
"learning_rate": 4.481002205183565e-05,
"loss": 0.059,
"num_input_tokens_seen": 115199232,
"step": 112500
},
{
"epoch": 1.0426181710816471,
"grad_norm": 1.329825520515442,
"learning_rate": 4.478695527813916e-05,
"loss": 0.061,
"num_input_tokens_seen": 115711232,
"step": 113000
},
{
"epoch": 1.0472315258209464,
"grad_norm": 0.6627879738807678,
"learning_rate": 4.4763888504442666e-05,
"loss": 0.0562,
"num_input_tokens_seen": 116223232,
"step": 113500
},
{
"epoch": 1.0518448805602458,
"grad_norm": 1.4965338706970215,
"learning_rate": 4.474082173074617e-05,
"loss": 0.0614,
"num_input_tokens_seen": 116735232,
"step": 114000
},
{
"epoch": 1.056458235299545,
"grad_norm": 4.595455646514893,
"learning_rate": 4.471775495704967e-05,
"loss": 0.0569,
"num_input_tokens_seen": 117247232,
"step": 114500
},
{
"epoch": 1.0610715900388445,
"grad_norm": 1.5899192094802856,
"learning_rate": 4.4694688183353175e-05,
"loss": 0.058,
"num_input_tokens_seen": 117759232,
"step": 115000
},
{
"epoch": 1.0656849447781438,
"grad_norm": 1.812812328338623,
"learning_rate": 4.4671621409656675e-05,
"loss": 0.0564,
"num_input_tokens_seen": 118271232,
"step": 115500
},
{
"epoch": 1.070298299517443,
"grad_norm": 1.8089003562927246,
"learning_rate": 4.4648554635960176e-05,
"loss": 0.0664,
"num_input_tokens_seen": 118783232,
"step": 116000
},
{
"epoch": 1.0749116542567425,
"grad_norm": 2.216608762741089,
"learning_rate": 4.462548786226368e-05,
"loss": 0.0599,
"num_input_tokens_seen": 119295232,
"step": 116500
},
{
"epoch": 1.0795250089960418,
"grad_norm": 2.6362509727478027,
"learning_rate": 4.4602421088567184e-05,
"loss": 0.0585,
"num_input_tokens_seen": 119807232,
"step": 117000
},
{
"epoch": 1.084138363735341,
"grad_norm": 0.8326151371002197,
"learning_rate": 4.457935431487069e-05,
"loss": 0.0593,
"num_input_tokens_seen": 120319232,
"step": 117500
},
{
"epoch": 1.0887517184746405,
"grad_norm": 1.3363105058670044,
"learning_rate": 4.455628754117419e-05,
"loss": 0.056,
"num_input_tokens_seen": 120831232,
"step": 118000
},
{
"epoch": 1.0933650732139397,
"grad_norm": 2.2342283725738525,
"learning_rate": 4.45332207674777e-05,
"loss": 0.0607,
"num_input_tokens_seen": 121343232,
"step": 118500
},
{
"epoch": 1.097978427953239,
"grad_norm": 1.9718506336212158,
"learning_rate": 4.45101539937812e-05,
"loss": 0.0625,
"num_input_tokens_seen": 121855232,
"step": 119000
},
{
"epoch": 1.1025917826925384,
"grad_norm": 0.7142735123634338,
"learning_rate": 4.448708722008471e-05,
"loss": 0.0565,
"num_input_tokens_seen": 122367232,
"step": 119500
},
{
"epoch": 1.1072051374318377,
"grad_norm": 1.1628931760787964,
"learning_rate": 4.446402044638821e-05,
"loss": 0.0583,
"num_input_tokens_seen": 122879232,
"step": 120000
},
{
"epoch": 1.111818492171137,
"grad_norm": 1.8776410818099976,
"learning_rate": 4.444095367269171e-05,
"loss": 0.0642,
"num_input_tokens_seen": 123391232,
"step": 120500
},
{
"epoch": 1.1164318469104364,
"grad_norm": 1.5755925178527832,
"learning_rate": 4.4417886898995216e-05,
"loss": 0.0631,
"num_input_tokens_seen": 123903232,
"step": 121000
},
{
"epoch": 1.1210452016497356,
"grad_norm": 1.7925944328308105,
"learning_rate": 4.4394820125298716e-05,
"loss": 0.0603,
"num_input_tokens_seen": 124415232,
"step": 121500
},
{
"epoch": 1.125658556389035,
"grad_norm": 2.4041876792907715,
"learning_rate": 4.437175335160222e-05,
"loss": 0.0552,
"num_input_tokens_seen": 124927232,
"step": 122000
},
{
"epoch": 1.1302719111283344,
"grad_norm": 2.1456570625305176,
"learning_rate": 4.4348686577905724e-05,
"loss": 0.065,
"num_input_tokens_seen": 125439232,
"step": 122500
},
{
"epoch": 1.1348852658676336,
"grad_norm": 1.278905987739563,
"learning_rate": 4.432561980420923e-05,
"loss": 0.0648,
"num_input_tokens_seen": 125951232,
"step": 123000
},
{
"epoch": 1.1394986206069329,
"grad_norm": 1.4145876169204712,
"learning_rate": 4.4302553030512725e-05,
"loss": 0.0603,
"num_input_tokens_seen": 126463232,
"step": 123500
},
{
"epoch": 1.1441119753462323,
"grad_norm": 1.247292160987854,
"learning_rate": 4.427948625681623e-05,
"loss": 0.0616,
"num_input_tokens_seen": 126975232,
"step": 124000
},
{
"epoch": 1.1487253300855316,
"grad_norm": 1.0648530721664429,
"learning_rate": 4.425641948311974e-05,
"loss": 0.0577,
"num_input_tokens_seen": 127487232,
"step": 124500
},
{
"epoch": 1.1533386848248308,
"grad_norm": 2.285616874694824,
"learning_rate": 4.423335270942324e-05,
"loss": 0.0574,
"num_input_tokens_seen": 127999232,
"step": 125000
},
{
"epoch": 1.1579520395641303,
"grad_norm": 1.124847173690796,
"learning_rate": 4.421028593572674e-05,
"loss": 0.0599,
"num_input_tokens_seen": 128511232,
"step": 125500
},
{
"epoch": 1.1625653943034295,
"grad_norm": 2.4443585872650146,
"learning_rate": 4.418721916203025e-05,
"loss": 0.0568,
"num_input_tokens_seen": 129023232,
"step": 126000
},
{
"epoch": 1.167178749042729,
"grad_norm": 0.8579834699630737,
"learning_rate": 4.416415238833375e-05,
"loss": 0.0628,
"num_input_tokens_seen": 129535232,
"step": 126500
},
{
"epoch": 1.1717921037820282,
"grad_norm": 3.7771518230438232,
"learning_rate": 4.4141085614637256e-05,
"loss": 0.0618,
"num_input_tokens_seen": 130047232,
"step": 127000
},
{
"epoch": 1.1764054585213275,
"grad_norm": 1.2302302122116089,
"learning_rate": 4.411801884094076e-05,
"loss": 0.0569,
"num_input_tokens_seen": 130559232,
"step": 127500
},
{
"epoch": 1.1810188132606267,
"grad_norm": 5.366886615753174,
"learning_rate": 4.409495206724426e-05,
"loss": 0.0581,
"num_input_tokens_seen": 131071232,
"step": 128000
},
{
"epoch": 1.1856321679999262,
"grad_norm": 1.6237967014312744,
"learning_rate": 4.4071885293547765e-05,
"loss": 0.0564,
"num_input_tokens_seen": 131583232,
"step": 128500
},
{
"epoch": 1.1902455227392255,
"grad_norm": 1.025489091873169,
"learning_rate": 4.4048818519851265e-05,
"loss": 0.062,
"num_input_tokens_seen": 132095232,
"step": 129000
},
{
"epoch": 1.194858877478525,
"grad_norm": 3.0035746097564697,
"learning_rate": 4.402575174615477e-05,
"loss": 0.0567,
"num_input_tokens_seen": 132607232,
"step": 129500
},
{
"epoch": 1.1994722322178242,
"grad_norm": 0.4716099202632904,
"learning_rate": 4.4002684972458273e-05,
"loss": 0.0594,
"num_input_tokens_seen": 133119232,
"step": 130000
},
{
"epoch": 1.2040855869571234,
"grad_norm": 1.073433756828308,
"learning_rate": 4.397961819876178e-05,
"loss": 0.0638,
"num_input_tokens_seen": 133631232,
"step": 130500
},
{
"epoch": 1.208698941696423,
"grad_norm": 1.676879644393921,
"learning_rate": 4.395655142506528e-05,
"loss": 0.0665,
"num_input_tokens_seen": 134143232,
"step": 131000
},
{
"epoch": 1.2133122964357221,
"grad_norm": 1.4313554763793945,
"learning_rate": 4.393348465136878e-05,
"loss": 0.062,
"num_input_tokens_seen": 134655232,
"step": 131500
},
{
"epoch": 1.2179256511750214,
"grad_norm": 1.8880019187927246,
"learning_rate": 4.391041787767229e-05,
"loss": 0.0568,
"num_input_tokens_seen": 135167232,
"step": 132000
},
{
"epoch": 1.2225390059143209,
"grad_norm": 1.572786569595337,
"learning_rate": 4.38873511039758e-05,
"loss": 0.0581,
"num_input_tokens_seen": 135679232,
"step": 132500
},
{
"epoch": 1.22715236065362,
"grad_norm": 1.1069833040237427,
"learning_rate": 4.386428433027929e-05,
"loss": 0.0567,
"num_input_tokens_seen": 136191232,
"step": 133000
},
{
"epoch": 1.2317657153929193,
"grad_norm": 1.1832222938537598,
"learning_rate": 4.38412175565828e-05,
"loss": 0.0589,
"num_input_tokens_seen": 136703232,
"step": 133500
},
{
"epoch": 1.2363790701322188,
"grad_norm": 0.8395095467567444,
"learning_rate": 4.3818150782886305e-05,
"loss": 0.0607,
"num_input_tokens_seen": 137215232,
"step": 134000
},
{
"epoch": 1.240992424871518,
"grad_norm": 1.2240726947784424,
"learning_rate": 4.3795084009189806e-05,
"loss": 0.0639,
"num_input_tokens_seen": 137727232,
"step": 134500
},
{
"epoch": 1.2456057796108173,
"grad_norm": 0.596113383769989,
"learning_rate": 4.3772017235493306e-05,
"loss": 0.0622,
"num_input_tokens_seen": 138239232,
"step": 135000
},
{
"epoch": 1.2502191343501168,
"grad_norm": 1.9236828088760376,
"learning_rate": 4.3748950461796814e-05,
"loss": 0.0607,
"num_input_tokens_seen": 138751232,
"step": 135500
},
{
"epoch": 1.254832489089416,
"grad_norm": 0.9456164836883545,
"learning_rate": 4.3725883688100314e-05,
"loss": 0.0583,
"num_input_tokens_seen": 139263232,
"step": 136000
},
{
"epoch": 1.2594458438287153,
"grad_norm": 3.4136688709259033,
"learning_rate": 4.3702816914403815e-05,
"loss": 0.0638,
"num_input_tokens_seen": 139775232,
"step": 136500
},
{
"epoch": 1.2640591985680147,
"grad_norm": 1.01094388961792,
"learning_rate": 4.367975014070732e-05,
"loss": 0.0598,
"num_input_tokens_seen": 140287232,
"step": 137000
},
{
"epoch": 1.268672553307314,
"grad_norm": 1.1260863542556763,
"learning_rate": 4.365668336701082e-05,
"loss": 0.0586,
"num_input_tokens_seen": 140799232,
"step": 137500
},
{
"epoch": 1.2732859080466135,
"grad_norm": 3.8169174194335938,
"learning_rate": 4.363361659331433e-05,
"loss": 0.0616,
"num_input_tokens_seen": 141311232,
"step": 138000
},
{
"epoch": 1.2778992627859127,
"grad_norm": 0.5968789458274841,
"learning_rate": 4.361054981961783e-05,
"loss": 0.0586,
"num_input_tokens_seen": 141823232,
"step": 138500
},
{
"epoch": 1.282512617525212,
"grad_norm": 1.5847851037979126,
"learning_rate": 4.358748304592133e-05,
"loss": 0.0531,
"num_input_tokens_seen": 142335232,
"step": 139000
},
{
"epoch": 1.2871259722645112,
"grad_norm": 1.6152338981628418,
"learning_rate": 4.356441627222484e-05,
"loss": 0.0621,
"num_input_tokens_seen": 142847232,
"step": 139500
},
{
"epoch": 1.2917393270038107,
"grad_norm": 1.3131306171417236,
"learning_rate": 4.3541349498528346e-05,
"loss": 0.0596,
"num_input_tokens_seen": 143359232,
"step": 140000
},
{
"epoch": 1.29635268174311,
"grad_norm": 1.424111247062683,
"learning_rate": 4.351828272483185e-05,
"loss": 0.0606,
"num_input_tokens_seen": 143871232,
"step": 140500
},
{
"epoch": 1.3009660364824094,
"grad_norm": 0.8023368716239929,
"learning_rate": 4.349521595113535e-05,
"loss": 0.0644,
"num_input_tokens_seen": 144383232,
"step": 141000
},
{
"epoch": 1.3055793912217086,
"grad_norm": 1.9093987941741943,
"learning_rate": 4.3472149177438855e-05,
"loss": 0.063,
"num_input_tokens_seen": 144895232,
"step": 141500
},
{
"epoch": 1.3101927459610079,
"grad_norm": 2.1738569736480713,
"learning_rate": 4.3449082403742355e-05,
"loss": 0.0627,
"num_input_tokens_seen": 145407232,
"step": 142000
},
{
"epoch": 1.3148061007003071,
"grad_norm": 2.2907350063323975,
"learning_rate": 4.3426015630045856e-05,
"loss": 0.0628,
"num_input_tokens_seen": 145919232,
"step": 142500
},
{
"epoch": 1.3194194554396066,
"grad_norm": 1.2344714403152466,
"learning_rate": 4.340294885634936e-05,
"loss": 0.0589,
"num_input_tokens_seen": 146431232,
"step": 143000
},
{
"epoch": 1.3240328101789058,
"grad_norm": 2.3011679649353027,
"learning_rate": 4.337988208265287e-05,
"loss": 0.0639,
"num_input_tokens_seen": 146943232,
"step": 143500
},
{
"epoch": 1.3286461649182053,
"grad_norm": 1.3081352710723877,
"learning_rate": 4.3356815308956364e-05,
"loss": 0.0607,
"num_input_tokens_seen": 147455232,
"step": 144000
},
{
"epoch": 1.3332595196575046,
"grad_norm": 1.5605255365371704,
"learning_rate": 4.333374853525987e-05,
"loss": 0.0619,
"num_input_tokens_seen": 147967232,
"step": 144500
},
{
"epoch": 1.3378728743968038,
"grad_norm": 1.3698718547821045,
"learning_rate": 4.331068176156338e-05,
"loss": 0.0592,
"num_input_tokens_seen": 148479232,
"step": 145000
},
{
"epoch": 1.3424862291361033,
"grad_norm": 0.7845633029937744,
"learning_rate": 4.328761498786688e-05,
"loss": 0.0649,
"num_input_tokens_seen": 148991232,
"step": 145500
},
{
"epoch": 1.3470995838754025,
"grad_norm": 2.0420374870300293,
"learning_rate": 4.326454821417038e-05,
"loss": 0.0598,
"num_input_tokens_seen": 149503232,
"step": 146000
},
{
"epoch": 1.3517129386147018,
"grad_norm": 2.2831552028656006,
"learning_rate": 4.324148144047389e-05,
"loss": 0.0614,
"num_input_tokens_seen": 150015232,
"step": 146500
},
{
"epoch": 1.3563262933540012,
"grad_norm": 0.9809445738792419,
"learning_rate": 4.321841466677739e-05,
"loss": 0.0588,
"num_input_tokens_seen": 150527232,
"step": 147000
},
{
"epoch": 1.3609396480933005,
"grad_norm": 1.6517871618270874,
"learning_rate": 4.3195347893080895e-05,
"loss": 0.061,
"num_input_tokens_seen": 151039232,
"step": 147500
},
{
"epoch": 1.3655530028325997,
"grad_norm": 0.8756200075149536,
"learning_rate": 4.3172281119384396e-05,
"loss": 0.0601,
"num_input_tokens_seen": 151551232,
"step": 148000
},
{
"epoch": 1.3701663575718992,
"grad_norm": 4.2246317863464355,
"learning_rate": 4.31492143456879e-05,
"loss": 0.0559,
"num_input_tokens_seen": 152063232,
"step": 148500
},
{
"epoch": 1.3747797123111984,
"grad_norm": 3.220839738845825,
"learning_rate": 4.3126147571991404e-05,
"loss": 0.0572,
"num_input_tokens_seen": 152575232,
"step": 149000
},
{
"epoch": 1.379393067050498,
"grad_norm": 1.6114301681518555,
"learning_rate": 4.3103080798294905e-05,
"loss": 0.0593,
"num_input_tokens_seen": 153087232,
"step": 149500
},
{
"epoch": 1.3840064217897972,
"grad_norm": 0.6551116108894348,
"learning_rate": 4.3080014024598405e-05,
"loss": 0.0626,
"num_input_tokens_seen": 153599232,
"step": 150000
},
{
"epoch": 1.3886197765290964,
"grad_norm": 2.2895658016204834,
"learning_rate": 4.305694725090191e-05,
"loss": 0.064,
"num_input_tokens_seen": 154111232,
"step": 150500
},
{
"epoch": 1.3932331312683957,
"grad_norm": 2.927482843399048,
"learning_rate": 4.303388047720542e-05,
"loss": 0.0625,
"num_input_tokens_seen": 154623232,
"step": 151000
},
{
"epoch": 1.3978464860076951,
"grad_norm": 1.2749851942062378,
"learning_rate": 4.301081370350892e-05,
"loss": 0.0579,
"num_input_tokens_seen": 155135232,
"step": 151500
},
{
"epoch": 1.4024598407469944,
"grad_norm": 1.7866413593292236,
"learning_rate": 4.298774692981242e-05,
"loss": 0.0574,
"num_input_tokens_seen": 155647232,
"step": 152000
},
{
"epoch": 1.4070731954862938,
"grad_norm": 2.288804292678833,
"learning_rate": 4.296468015611593e-05,
"loss": 0.0631,
"num_input_tokens_seen": 156159232,
"step": 152500
},
{
"epoch": 1.411686550225593,
"grad_norm": 1.509840965270996,
"learning_rate": 4.294161338241943e-05,
"loss": 0.0585,
"num_input_tokens_seen": 156671232,
"step": 153000
},
{
"epoch": 1.4162999049648923,
"grad_norm": 0.8478446006774902,
"learning_rate": 4.291854660872293e-05,
"loss": 0.0593,
"num_input_tokens_seen": 157183232,
"step": 153500
},
{
"epoch": 1.4209132597041916,
"grad_norm": 1.4515230655670166,
"learning_rate": 4.289547983502644e-05,
"loss": 0.0599,
"num_input_tokens_seen": 157695232,
"step": 154000
},
{
"epoch": 1.425526614443491,
"grad_norm": 0.7513217926025391,
"learning_rate": 4.2872413061329944e-05,
"loss": 0.0602,
"num_input_tokens_seen": 158207232,
"step": 154500
},
{
"epoch": 1.4301399691827903,
"grad_norm": 2.4477181434631348,
"learning_rate": 4.284934628763344e-05,
"loss": 0.0583,
"num_input_tokens_seen": 158719232,
"step": 155000
},
{
"epoch": 1.4347533239220898,
"grad_norm": 1.2855825424194336,
"learning_rate": 4.2826279513936945e-05,
"loss": 0.0653,
"num_input_tokens_seen": 159231232,
"step": 155500
},
{
"epoch": 1.439366678661389,
"grad_norm": 0.5422343611717224,
"learning_rate": 4.280321274024045e-05,
"loss": 0.0601,
"num_input_tokens_seen": 159743232,
"step": 156000
},
{
"epoch": 1.4439800334006883,
"grad_norm": 1.519142746925354,
"learning_rate": 4.278014596654395e-05,
"loss": 0.0558,
"num_input_tokens_seen": 160255232,
"step": 156500
},
{
"epoch": 1.4485933881399875,
"grad_norm": 1.936989426612854,
"learning_rate": 4.2757079192847454e-05,
"loss": 0.0572,
"num_input_tokens_seen": 160767232,
"step": 157000
},
{
"epoch": 1.453206742879287,
"grad_norm": 2.0965301990509033,
"learning_rate": 4.273401241915096e-05,
"loss": 0.0655,
"num_input_tokens_seen": 161279232,
"step": 157500
},
{
"epoch": 1.4578200976185862,
"grad_norm": 1.300350308418274,
"learning_rate": 4.271094564545446e-05,
"loss": 0.0606,
"num_input_tokens_seen": 161791232,
"step": 158000
},
{
"epoch": 1.4624334523578857,
"grad_norm": 2.8612143993377686,
"learning_rate": 4.268787887175797e-05,
"loss": 0.0587,
"num_input_tokens_seen": 162303232,
"step": 158500
},
{
"epoch": 1.467046807097185,
"grad_norm": 1.869927167892456,
"learning_rate": 4.266481209806147e-05,
"loss": 0.0626,
"num_input_tokens_seen": 162815232,
"step": 159000
},
{
"epoch": 1.4716601618364842,
"grad_norm": 0.6784268617630005,
"learning_rate": 4.264174532436497e-05,
"loss": 0.0587,
"num_input_tokens_seen": 163327232,
"step": 159500
},
{
"epoch": 1.4762735165757837,
"grad_norm": 1.315468192100525,
"learning_rate": 4.261867855066848e-05,
"loss": 0.0558,
"num_input_tokens_seen": 163839232,
"step": 160000
},
{
"epoch": 1.480886871315083,
"grad_norm": 0.5266712307929993,
"learning_rate": 4.2595611776971985e-05,
"loss": 0.0601,
"num_input_tokens_seen": 164351232,
"step": 160500
},
{
"epoch": 1.4855002260543821,
"grad_norm": 0.976466178894043,
"learning_rate": 4.2572545003275486e-05,
"loss": 0.059,
"num_input_tokens_seen": 164863232,
"step": 161000
},
{
"epoch": 1.4901135807936816,
"grad_norm": 2.195340633392334,
"learning_rate": 4.2549478229578986e-05,
"loss": 0.0618,
"num_input_tokens_seen": 165375232,
"step": 161500
},
{
"epoch": 1.4947269355329809,
"grad_norm": 0.6188003420829773,
"learning_rate": 4.2526411455882494e-05,
"loss": 0.062,
"num_input_tokens_seen": 165887232,
"step": 162000
},
{
"epoch": 1.49934029027228,
"grad_norm": 1.496407389640808,
"learning_rate": 4.2503344682185994e-05,
"loss": 0.0591,
"num_input_tokens_seen": 166399232,
"step": 162500
},
{
"epoch": 1.5039536450115794,
"grad_norm": 0.94919753074646,
"learning_rate": 4.2480277908489495e-05,
"loss": 0.06,
"num_input_tokens_seen": 166911232,
"step": 163000
},
{
"epoch": 1.5085669997508788,
"grad_norm": 1.6207939386367798,
"learning_rate": 4.2457211134793e-05,
"loss": 0.0599,
"num_input_tokens_seen": 167423232,
"step": 163500
},
{
"epoch": 1.5131803544901783,
"grad_norm": 1.1205254793167114,
"learning_rate": 4.24341443610965e-05,
"loss": 0.0617,
"num_input_tokens_seen": 167935232,
"step": 164000
},
{
"epoch": 1.5177937092294775,
"grad_norm": 1.0323721170425415,
"learning_rate": 4.24110775874e-05,
"loss": 0.0601,
"num_input_tokens_seen": 168447232,
"step": 164500
},
{
"epoch": 1.5224070639687768,
"grad_norm": 0.6799350380897522,
"learning_rate": 4.238801081370351e-05,
"loss": 0.0631,
"num_input_tokens_seen": 168959232,
"step": 165000
},
{
"epoch": 1.527020418708076,
"grad_norm": 1.2749136686325073,
"learning_rate": 4.236494404000702e-05,
"loss": 0.058,
"num_input_tokens_seen": 169471232,
"step": 165500
},
{
"epoch": 1.5316337734473755,
"grad_norm": 2.35078763961792,
"learning_rate": 4.234187726631052e-05,
"loss": 0.066,
"num_input_tokens_seen": 169983232,
"step": 166000
},
{
"epoch": 1.5362471281866747,
"grad_norm": 1.8924311399459839,
"learning_rate": 4.231881049261402e-05,
"loss": 0.0591,
"num_input_tokens_seen": 170495232,
"step": 166500
},
{
"epoch": 1.5408604829259742,
"grad_norm": 2.8488757610321045,
"learning_rate": 4.2295743718917527e-05,
"loss": 0.0584,
"num_input_tokens_seen": 171007232,
"step": 167000
},
{
"epoch": 1.5454738376652735,
"grad_norm": 1.7758262157440186,
"learning_rate": 4.227267694522103e-05,
"loss": 0.0661,
"num_input_tokens_seen": 171519232,
"step": 167500
},
{
"epoch": 1.5500871924045727,
"grad_norm": 0.7893622517585754,
"learning_rate": 4.224961017152453e-05,
"loss": 0.0594,
"num_input_tokens_seen": 172031232,
"step": 168000
},
{
"epoch": 1.554700547143872,
"grad_norm": 1.069485068321228,
"learning_rate": 4.2226543397828035e-05,
"loss": 0.0656,
"num_input_tokens_seen": 172543232,
"step": 168500
},
{
"epoch": 1.5593139018831714,
"grad_norm": 2.2371785640716553,
"learning_rate": 4.2203476624131536e-05,
"loss": 0.058,
"num_input_tokens_seen": 173055232,
"step": 169000
},
{
"epoch": 1.5639272566224707,
"grad_norm": 1.76310396194458,
"learning_rate": 4.218040985043504e-05,
"loss": 0.0623,
"num_input_tokens_seen": 173567232,
"step": 169500
},
{
"epoch": 1.5685406113617701,
"grad_norm": 2.7890520095825195,
"learning_rate": 4.2157343076738544e-05,
"loss": 0.0582,
"num_input_tokens_seen": 174079232,
"step": 170000
},
{
"epoch": 1.5731539661010694,
"grad_norm": 2.2342007160186768,
"learning_rate": 4.2134276303042044e-05,
"loss": 0.0645,
"num_input_tokens_seen": 174591232,
"step": 170500
},
{
"epoch": 1.5777673208403686,
"grad_norm": 1.6538183689117432,
"learning_rate": 4.211120952934555e-05,
"loss": 0.0578,
"num_input_tokens_seen": 175103232,
"step": 171000
},
{
"epoch": 1.5823806755796679,
"grad_norm": 6.509249687194824,
"learning_rate": 4.208814275564906e-05,
"loss": 0.0638,
"num_input_tokens_seen": 175615232,
"step": 171500
},
{
"epoch": 1.5869940303189674,
"grad_norm": 2.7748773097991943,
"learning_rate": 4.206507598195256e-05,
"loss": 0.0646,
"num_input_tokens_seen": 176127232,
"step": 172000
},
{
"epoch": 1.5916073850582668,
"grad_norm": 4.16091251373291,
"learning_rate": 4.204200920825606e-05,
"loss": 0.0653,
"num_input_tokens_seen": 176639232,
"step": 172500
},
{
"epoch": 1.596220739797566,
"grad_norm": 1.4821609258651733,
"learning_rate": 4.201894243455957e-05,
"loss": 0.0642,
"num_input_tokens_seen": 177151232,
"step": 173000
},
{
"epoch": 1.6008340945368653,
"grad_norm": 0.9436431527137756,
"learning_rate": 4.199587566086307e-05,
"loss": 0.0603,
"num_input_tokens_seen": 177663232,
"step": 173500
},
{
"epoch": 1.6054474492761646,
"grad_norm": 1.735992193222046,
"learning_rate": 4.197280888716657e-05,
"loss": 0.0596,
"num_input_tokens_seen": 178175232,
"step": 174000
},
{
"epoch": 1.6100608040154638,
"grad_norm": 1.1625646352767944,
"learning_rate": 4.1949742113470076e-05,
"loss": 0.0601,
"num_input_tokens_seen": 178687232,
"step": 174500
},
{
"epoch": 1.6146741587547633,
"grad_norm": 1.0174745321273804,
"learning_rate": 4.192667533977358e-05,
"loss": 0.058,
"num_input_tokens_seen": 179199232,
"step": 175000
},
{
"epoch": 1.6192875134940627,
"grad_norm": 1.141682744026184,
"learning_rate": 4.190360856607708e-05,
"loss": 0.0622,
"num_input_tokens_seen": 179711232,
"step": 175500
},
{
"epoch": 1.623900868233362,
"grad_norm": 1.165004014968872,
"learning_rate": 4.1880541792380585e-05,
"loss": 0.0627,
"num_input_tokens_seen": 180223232,
"step": 176000
},
{
"epoch": 1.6285142229726612,
"grad_norm": 2.1781582832336426,
"learning_rate": 4.185747501868409e-05,
"loss": 0.0631,
"num_input_tokens_seen": 180735232,
"step": 176500
},
{
"epoch": 1.6331275777119605,
"grad_norm": 1.5659372806549072,
"learning_rate": 4.183440824498759e-05,
"loss": 0.0607,
"num_input_tokens_seen": 181247232,
"step": 177000
},
{
"epoch": 1.63774093245126,
"grad_norm": 1.9345473051071167,
"learning_rate": 4.181134147129109e-05,
"loss": 0.0567,
"num_input_tokens_seen": 181759232,
"step": 177500
},
{
"epoch": 1.6423542871905592,
"grad_norm": 0.8415033221244812,
"learning_rate": 4.17882746975946e-05,
"loss": 0.06,
"num_input_tokens_seen": 182271232,
"step": 178000
},
{
"epoch": 1.6469676419298587,
"grad_norm": 0.4496413767337799,
"learning_rate": 4.17652079238981e-05,
"loss": 0.0583,
"num_input_tokens_seen": 182783232,
"step": 178500
},
{
"epoch": 1.651580996669158,
"grad_norm": 1.1432942152023315,
"learning_rate": 4.174214115020161e-05,
"loss": 0.062,
"num_input_tokens_seen": 183295232,
"step": 179000
},
{
"epoch": 1.6561943514084572,
"grad_norm": 0.4867847263813019,
"learning_rate": 4.171907437650511e-05,
"loss": 0.0653,
"num_input_tokens_seen": 183807232,
"step": 179500
},
{
"epoch": 1.6608077061477564,
"grad_norm": 3.039292335510254,
"learning_rate": 4.169600760280861e-05,
"loss": 0.0578,
"num_input_tokens_seen": 184319232,
"step": 180000
},
{
"epoch": 1.6654210608870559,
"grad_norm": 2.18542218208313,
"learning_rate": 4.167294082911212e-05,
"loss": 0.064,
"num_input_tokens_seen": 184831232,
"step": 180500
},
{
"epoch": 1.6700344156263551,
"grad_norm": 0.9734911918640137,
"learning_rate": 4.164987405541562e-05,
"loss": 0.0578,
"num_input_tokens_seen": 185343232,
"step": 181000
},
{
"epoch": 1.6746477703656546,
"grad_norm": 0.8751457929611206,
"learning_rate": 4.162680728171912e-05,
"loss": 0.0593,
"num_input_tokens_seen": 185855232,
"step": 181500
},
{
"epoch": 1.6792611251049538,
"grad_norm": 1.0533229112625122,
"learning_rate": 4.1603740508022625e-05,
"loss": 0.0601,
"num_input_tokens_seen": 186367232,
"step": 182000
},
{
"epoch": 1.683874479844253,
"grad_norm": 0.742938220500946,
"learning_rate": 4.158067373432613e-05,
"loss": 0.0589,
"num_input_tokens_seen": 186879232,
"step": 182500
},
{
"epoch": 1.6884878345835523,
"grad_norm": 1.432569146156311,
"learning_rate": 4.155760696062963e-05,
"loss": 0.061,
"num_input_tokens_seen": 187391232,
"step": 183000
},
{
"epoch": 1.6931011893228518,
"grad_norm": 2.900394916534424,
"learning_rate": 4.1534540186933134e-05,
"loss": 0.058,
"num_input_tokens_seen": 187903232,
"step": 183500
},
{
"epoch": 1.6977145440621513,
"grad_norm": 1.1864616870880127,
"learning_rate": 4.151147341323664e-05,
"loss": 0.0594,
"num_input_tokens_seen": 188415232,
"step": 184000
},
{
"epoch": 1.7023278988014505,
"grad_norm": 2.3834102153778076,
"learning_rate": 4.148840663954014e-05,
"loss": 0.0623,
"num_input_tokens_seen": 188927232,
"step": 184500
},
{
"epoch": 1.7069412535407498,
"grad_norm": 2.183478355407715,
"learning_rate": 4.146533986584364e-05,
"loss": 0.0621,
"num_input_tokens_seen": 189439232,
"step": 185000
},
{
"epoch": 1.711554608280049,
"grad_norm": 1.4946995973587036,
"learning_rate": 4.144227309214715e-05,
"loss": 0.0585,
"num_input_tokens_seen": 189951232,
"step": 185500
},
{
"epoch": 1.7161679630193483,
"grad_norm": 2.6389856338500977,
"learning_rate": 4.141920631845066e-05,
"loss": 0.0641,
"num_input_tokens_seen": 190463232,
"step": 186000
},
{
"epoch": 1.7207813177586477,
"grad_norm": 1.5870720148086548,
"learning_rate": 4.139613954475416e-05,
"loss": 0.0622,
"num_input_tokens_seen": 190975232,
"step": 186500
},
{
"epoch": 1.7253946724979472,
"grad_norm": 1.0115468502044678,
"learning_rate": 4.137307277105766e-05,
"loss": 0.0602,
"num_input_tokens_seen": 191487232,
"step": 187000
},
{
"epoch": 1.7300080272372464,
"grad_norm": 2.0021095275878906,
"learning_rate": 4.1350005997361166e-05,
"loss": 0.0585,
"num_input_tokens_seen": 191999232,
"step": 187500
},
{
"epoch": 1.7346213819765457,
"grad_norm": 1.7288790941238403,
"learning_rate": 4.1326939223664666e-05,
"loss": 0.064,
"num_input_tokens_seen": 192511232,
"step": 188000
},
{
"epoch": 1.739234736715845,
"grad_norm": 2.1877362728118896,
"learning_rate": 4.130387244996817e-05,
"loss": 0.061,
"num_input_tokens_seen": 193023232,
"step": 188500
},
{
"epoch": 1.7438480914551442,
"grad_norm": 2.1723220348358154,
"learning_rate": 4.1280805676271674e-05,
"loss": 0.0611,
"num_input_tokens_seen": 193535232,
"step": 189000
},
{
"epoch": 1.7484614461944437,
"grad_norm": 1.1203595399856567,
"learning_rate": 4.1257738902575175e-05,
"loss": 0.0587,
"num_input_tokens_seen": 194047232,
"step": 189500
},
{
"epoch": 1.7530748009337431,
"grad_norm": 1.7950832843780518,
"learning_rate": 4.123467212887868e-05,
"loss": 0.0619,
"num_input_tokens_seen": 194559232,
"step": 190000
},
{
"epoch": 1.7576881556730424,
"grad_norm": 0.8511695265769958,
"learning_rate": 4.121160535518218e-05,
"loss": 0.0587,
"num_input_tokens_seen": 195071232,
"step": 190500
},
{
"epoch": 1.7623015104123416,
"grad_norm": 0.49872857332229614,
"learning_rate": 4.118853858148568e-05,
"loss": 0.0586,
"num_input_tokens_seen": 195583232,
"step": 191000
},
{
"epoch": 1.7669148651516409,
"grad_norm": 1.272387981414795,
"learning_rate": 4.116547180778919e-05,
"loss": 0.062,
"num_input_tokens_seen": 196095232,
"step": 191500
},
{
"epoch": 1.7715282198909403,
"grad_norm": 3.0328872203826904,
"learning_rate": 4.11424050340927e-05,
"loss": 0.0561,
"num_input_tokens_seen": 196607232,
"step": 192000
},
{
"epoch": 1.7761415746302396,
"grad_norm": 1.1026365756988525,
"learning_rate": 4.111933826039619e-05,
"loss": 0.061,
"num_input_tokens_seen": 197119232,
"step": 192500
},
{
"epoch": 1.780754929369539,
"grad_norm": 1.523284673690796,
"learning_rate": 4.10962714866997e-05,
"loss": 0.0647,
"num_input_tokens_seen": 197631232,
"step": 193000
},
{
"epoch": 1.7853682841088383,
"grad_norm": 2.571349859237671,
"learning_rate": 4.1073204713003207e-05,
"loss": 0.0572,
"num_input_tokens_seen": 198143232,
"step": 193500
},
{
"epoch": 1.7899816388481375,
"grad_norm": 1.1206070184707642,
"learning_rate": 4.105013793930671e-05,
"loss": 0.065,
"num_input_tokens_seen": 198655232,
"step": 194000
},
{
"epoch": 1.7945949935874368,
"grad_norm": 1.2172856330871582,
"learning_rate": 4.102707116561021e-05,
"loss": 0.0624,
"num_input_tokens_seen": 199167232,
"step": 194500
},
{
"epoch": 1.7992083483267363,
"grad_norm": 1.3785135746002197,
"learning_rate": 4.1004004391913715e-05,
"loss": 0.0619,
"num_input_tokens_seen": 199679232,
"step": 195000
},
{
"epoch": 1.8038217030660355,
"grad_norm": 1.8791236877441406,
"learning_rate": 4.0980937618217216e-05,
"loss": 0.0594,
"num_input_tokens_seen": 200191232,
"step": 195500
},
{
"epoch": 1.808435057805335,
"grad_norm": 1.4721789360046387,
"learning_rate": 4.0957870844520716e-05,
"loss": 0.0584,
"num_input_tokens_seen": 200703232,
"step": 196000
},
{
"epoch": 1.8130484125446342,
"grad_norm": 2.4450087547302246,
"learning_rate": 4.0934804070824224e-05,
"loss": 0.0622,
"num_input_tokens_seen": 201215232,
"step": 196500
},
{
"epoch": 1.8176617672839335,
"grad_norm": 2.5776455402374268,
"learning_rate": 4.091173729712773e-05,
"loss": 0.062,
"num_input_tokens_seen": 201727232,
"step": 197000
},
{
"epoch": 1.8222751220232327,
"grad_norm": 0.703079104423523,
"learning_rate": 4.088867052343123e-05,
"loss": 0.063,
"num_input_tokens_seen": 202239232,
"step": 197500
},
{
"epoch": 1.8268884767625322,
"grad_norm": 3.7383570671081543,
"learning_rate": 4.086560374973473e-05,
"loss": 0.0621,
"num_input_tokens_seen": 202751232,
"step": 198000
},
{
"epoch": 1.8315018315018317,
"grad_norm": 1.2119007110595703,
"learning_rate": 4.084253697603824e-05,
"loss": 0.0638,
"num_input_tokens_seen": 203263232,
"step": 198500
},
{
"epoch": 1.836115186241131,
"grad_norm": 1.6069977283477783,
"learning_rate": 4.081947020234174e-05,
"loss": 0.0594,
"num_input_tokens_seen": 203775232,
"step": 199000
},
{
"epoch": 1.8407285409804302,
"grad_norm": 0.5176113843917847,
"learning_rate": 4.079640342864525e-05,
"loss": 0.0565,
"num_input_tokens_seen": 204287232,
"step": 199500
},
{
"epoch": 1.8453418957197294,
"grad_norm": 1.78886878490448,
"learning_rate": 4.077333665494875e-05,
"loss": 0.0599,
"num_input_tokens_seen": 204799232,
"step": 200000
},
{
"epoch": 1.8499552504590286,
"grad_norm": 0.8037757277488708,
"learning_rate": 4.075026988125225e-05,
"loss": 0.0584,
"num_input_tokens_seen": 205311232,
"step": 200500
},
{
"epoch": 1.8545686051983281,
"grad_norm": 0.8422955274581909,
"learning_rate": 4.0727203107555756e-05,
"loss": 0.0626,
"num_input_tokens_seen": 205823232,
"step": 201000
},
{
"epoch": 1.8591819599376276,
"grad_norm": 3.384787082672119,
"learning_rate": 4.0704136333859257e-05,
"loss": 0.0603,
"num_input_tokens_seen": 206335232,
"step": 201500
},
{
"epoch": 1.8637953146769268,
"grad_norm": 1.103167176246643,
"learning_rate": 4.068106956016276e-05,
"loss": 0.0608,
"num_input_tokens_seen": 206847232,
"step": 202000
},
{
"epoch": 1.868408669416226,
"grad_norm": 0.9550286531448364,
"learning_rate": 4.0658002786466264e-05,
"loss": 0.0583,
"num_input_tokens_seen": 207359232,
"step": 202500
},
{
"epoch": 1.8730220241555253,
"grad_norm": 1.2629748582839966,
"learning_rate": 4.063493601276977e-05,
"loss": 0.0599,
"num_input_tokens_seen": 207871232,
"step": 203000
},
{
"epoch": 1.8776353788948248,
"grad_norm": 1.8319883346557617,
"learning_rate": 4.061186923907327e-05,
"loss": 0.0557,
"num_input_tokens_seen": 208383232,
"step": 203500
},
{
"epoch": 1.882248733634124,
"grad_norm": 0.8122320175170898,
"learning_rate": 4.058880246537677e-05,
"loss": 0.0631,
"num_input_tokens_seen": 208895232,
"step": 204000
},
{
"epoch": 1.8868620883734235,
"grad_norm": 1.0240248441696167,
"learning_rate": 4.056573569168028e-05,
"loss": 0.0571,
"num_input_tokens_seen": 209407232,
"step": 204500
},
{
"epoch": 1.8914754431127228,
"grad_norm": 1.0079154968261719,
"learning_rate": 4.054266891798378e-05,
"loss": 0.0591,
"num_input_tokens_seen": 209919232,
"step": 205000
},
{
"epoch": 1.896088797852022,
"grad_norm": 0.7955754399299622,
"learning_rate": 4.051960214428728e-05,
"loss": 0.0579,
"num_input_tokens_seen": 210431232,
"step": 205500
},
{
"epoch": 1.9007021525913212,
"grad_norm": 2.3598215579986572,
"learning_rate": 4.049653537059079e-05,
"loss": 0.0578,
"num_input_tokens_seen": 210943232,
"step": 206000
},
{
"epoch": 1.9053155073306207,
"grad_norm": 2.217241048812866,
"learning_rate": 4.047346859689429e-05,
"loss": 0.0615,
"num_input_tokens_seen": 211455232,
"step": 206500
},
{
"epoch": 1.90992886206992,
"grad_norm": 0.9427639245986938,
"learning_rate": 4.045040182319779e-05,
"loss": 0.0654,
"num_input_tokens_seen": 211967232,
"step": 207000
},
{
"epoch": 1.9145422168092194,
"grad_norm": 2.3182663917541504,
"learning_rate": 4.04273350495013e-05,
"loss": 0.0605,
"num_input_tokens_seen": 212479232,
"step": 207500
},
{
"epoch": 1.9191555715485187,
"grad_norm": 2.283663272857666,
"learning_rate": 4.0404268275804805e-05,
"loss": 0.059,
"num_input_tokens_seen": 212991232,
"step": 208000
},
{
"epoch": 1.923768926287818,
"grad_norm": 0.8118070960044861,
"learning_rate": 4.0381201502108305e-05,
"loss": 0.0606,
"num_input_tokens_seen": 213503232,
"step": 208500
},
{
"epoch": 1.9283822810271172,
"grad_norm": 1.4257065057754517,
"learning_rate": 4.0358134728411806e-05,
"loss": 0.0619,
"num_input_tokens_seen": 214015232,
"step": 209000
},
{
"epoch": 1.9329956357664166,
"grad_norm": 1.2044384479522705,
"learning_rate": 4.033506795471531e-05,
"loss": 0.0554,
"num_input_tokens_seen": 214527232,
"step": 209500
},
{
"epoch": 1.9376089905057161,
"grad_norm": 1.2655075788497925,
"learning_rate": 4.0312001181018814e-05,
"loss": 0.0569,
"num_input_tokens_seen": 215039232,
"step": 210000
},
{
"epoch": 1.9422223452450154,
"grad_norm": 1.7089818716049194,
"learning_rate": 4.028893440732232e-05,
"loss": 0.062,
"num_input_tokens_seen": 215551232,
"step": 210500
},
{
"epoch": 1.9468356999843146,
"grad_norm": 1.0826196670532227,
"learning_rate": 4.026586763362582e-05,
"loss": 0.0611,
"num_input_tokens_seen": 216063232,
"step": 211000
},
{
"epoch": 1.9514490547236139,
"grad_norm": 0.5117043852806091,
"learning_rate": 4.024280085992932e-05,
"loss": 0.0618,
"num_input_tokens_seen": 216575232,
"step": 211500
},
{
"epoch": 1.956062409462913,
"grad_norm": 0.4635091722011566,
"learning_rate": 4.021973408623283e-05,
"loss": 0.0617,
"num_input_tokens_seen": 217087232,
"step": 212000
},
{
"epoch": 1.9606757642022126,
"grad_norm": 2.1524128913879395,
"learning_rate": 4.019666731253634e-05,
"loss": 0.0614,
"num_input_tokens_seen": 217599232,
"step": 212500
},
{
"epoch": 1.965289118941512,
"grad_norm": 1.02557373046875,
"learning_rate": 4.017360053883983e-05,
"loss": 0.0552,
"num_input_tokens_seen": 218111232,
"step": 213000
},
{
"epoch": 1.9699024736808113,
"grad_norm": 2.18851375579834,
"learning_rate": 4.015053376514334e-05,
"loss": 0.0597,
"num_input_tokens_seen": 218623232,
"step": 213500
},
{
"epoch": 1.9745158284201105,
"grad_norm": 2.4914391040802,
"learning_rate": 4.0127466991446846e-05,
"loss": 0.0616,
"num_input_tokens_seen": 219135232,
"step": 214000
},
{
"epoch": 1.9791291831594098,
"grad_norm": 1.8353182077407837,
"learning_rate": 4.0104400217750346e-05,
"loss": 0.0675,
"num_input_tokens_seen": 219647232,
"step": 214500
},
{
"epoch": 1.983742537898709,
"grad_norm": 5.431290149688721,
"learning_rate": 4.008133344405385e-05,
"loss": 0.0568,
"num_input_tokens_seen": 220159232,
"step": 215000
},
{
"epoch": 1.9883558926380085,
"grad_norm": 0.523113489151001,
"learning_rate": 4.0058266670357354e-05,
"loss": 0.0596,
"num_input_tokens_seen": 220671232,
"step": 215500
},
{
"epoch": 1.992969247377308,
"grad_norm": 0.5525696277618408,
"learning_rate": 4.0035199896660855e-05,
"loss": 0.0589,
"num_input_tokens_seen": 221183232,
"step": 216000
},
{
"epoch": 1.9975826021166072,
"grad_norm": 2.0920755863189697,
"learning_rate": 4.0012133122964355e-05,
"loss": 0.0603,
"num_input_tokens_seen": 221695232,
"step": 216500
},
{
"epoch": 2.0,
"eval_combined_score": 0.06747195769945506,
"eval_loss": 0.0674719586968422,
"eval_mse": 0.06747195670206793,
"eval_runtime": 46.4608,
"eval_samples_per_second": 2073.535,
"eval_steps_per_second": 259.208,
"num_input_tokens_seen": 221962752,
"step": 216762
},
{
"epoch": 2.0021959568559065,
"grad_norm": 2.938506841659546,
"learning_rate": 3.998906634926786e-05,
"loss": 0.0546,
"num_input_tokens_seen": 222206464,
"step": 217000
},
{
"epoch": 2.0068093115952057,
"grad_norm": 1.5632978677749634,
"learning_rate": 3.996599957557137e-05,
"loss": 0.0497,
"num_input_tokens_seen": 222718464,
"step": 217500
},
{
"epoch": 2.011422666334505,
"grad_norm": 2.7584619522094727,
"learning_rate": 3.994293280187487e-05,
"loss": 0.0504,
"num_input_tokens_seen": 223230464,
"step": 218000
},
{
"epoch": 2.0160360210738046,
"grad_norm": 0.7712005972862244,
"learning_rate": 3.991986602817837e-05,
"loss": 0.0498,
"num_input_tokens_seen": 223742464,
"step": 218500
},
{
"epoch": 2.020649375813104,
"grad_norm": 2.087860584259033,
"learning_rate": 3.989679925448188e-05,
"loss": 0.0514,
"num_input_tokens_seen": 224254464,
"step": 219000
},
{
"epoch": 2.025262730552403,
"grad_norm": 1.5292513370513916,
"learning_rate": 3.987373248078538e-05,
"loss": 0.046,
"num_input_tokens_seen": 224766464,
"step": 219500
},
{
"epoch": 2.0298760852917024,
"grad_norm": 2.2876648902893066,
"learning_rate": 3.985066570708888e-05,
"loss": 0.0514,
"num_input_tokens_seen": 225278464,
"step": 220000
},
{
"epoch": 2.0344894400310016,
"grad_norm": 1.1318377256393433,
"learning_rate": 3.982759893339239e-05,
"loss": 0.0466,
"num_input_tokens_seen": 225790464,
"step": 220500
},
{
"epoch": 2.039102794770301,
"grad_norm": 0.5960507988929749,
"learning_rate": 3.980453215969589e-05,
"loss": 0.0484,
"num_input_tokens_seen": 226302464,
"step": 221000
},
{
"epoch": 2.0437161495096006,
"grad_norm": 1.8446494340896606,
"learning_rate": 3.9781465385999395e-05,
"loss": 0.0458,
"num_input_tokens_seen": 226814464,
"step": 221500
},
{
"epoch": 2.0483295042489,
"grad_norm": 1.8140873908996582,
"learning_rate": 3.9758398612302896e-05,
"loss": 0.0496,
"num_input_tokens_seen": 227326464,
"step": 222000
},
{
"epoch": 2.052942858988199,
"grad_norm": 0.29578447341918945,
"learning_rate": 3.9735331838606396e-05,
"loss": 0.0447,
"num_input_tokens_seen": 227838464,
"step": 222500
},
{
"epoch": 2.0575562137274983,
"grad_norm": 1.8332575559616089,
"learning_rate": 3.9712265064909904e-05,
"loss": 0.042,
"num_input_tokens_seen": 228350464,
"step": 223000
},
{
"epoch": 2.0621695684667976,
"grad_norm": 1.091813325881958,
"learning_rate": 3.968919829121341e-05,
"loss": 0.0526,
"num_input_tokens_seen": 228862464,
"step": 223500
},
{
"epoch": 2.066782923206097,
"grad_norm": 0.7884387373924255,
"learning_rate": 3.9666131517516905e-05,
"loss": 0.0455,
"num_input_tokens_seen": 229374464,
"step": 224000
},
{
"epoch": 2.0713962779453965,
"grad_norm": 2.7083017826080322,
"learning_rate": 3.964306474382041e-05,
"loss": 0.0457,
"num_input_tokens_seen": 229886464,
"step": 224500
},
{
"epoch": 2.0760096326846957,
"grad_norm": 3.8200302124023438,
"learning_rate": 3.961999797012392e-05,
"loss": 0.0459,
"num_input_tokens_seen": 230398464,
"step": 225000
},
{
"epoch": 2.080622987423995,
"grad_norm": 1.0111039876937866,
"learning_rate": 3.959693119642742e-05,
"loss": 0.0464,
"num_input_tokens_seen": 230910464,
"step": 225500
},
{
"epoch": 2.0852363421632942,
"grad_norm": 0.7892510890960693,
"learning_rate": 3.957386442273092e-05,
"loss": 0.0527,
"num_input_tokens_seen": 231422464,
"step": 226000
},
{
"epoch": 2.0898496969025935,
"grad_norm": 0.9745638370513916,
"learning_rate": 3.955079764903443e-05,
"loss": 0.0446,
"num_input_tokens_seen": 231934464,
"step": 226500
},
{
"epoch": 2.0944630516418927,
"grad_norm": 1.1187430620193481,
"learning_rate": 3.952773087533793e-05,
"loss": 0.0505,
"num_input_tokens_seen": 232446464,
"step": 227000
},
{
"epoch": 2.0990764063811924,
"grad_norm": 1.3649568557739258,
"learning_rate": 3.950466410164143e-05,
"loss": 0.0494,
"num_input_tokens_seen": 232958464,
"step": 227500
},
{
"epoch": 2.1036897611204917,
"grad_norm": 1.2664381265640259,
"learning_rate": 3.9481597327944936e-05,
"loss": 0.0425,
"num_input_tokens_seen": 233470464,
"step": 228000
},
{
"epoch": 2.108303115859791,
"grad_norm": 2.6382997035980225,
"learning_rate": 3.9458530554248444e-05,
"loss": 0.0469,
"num_input_tokens_seen": 233982464,
"step": 228500
},
{
"epoch": 2.11291647059909,
"grad_norm": 1.4181214570999146,
"learning_rate": 3.9435463780551944e-05,
"loss": 0.0465,
"num_input_tokens_seen": 234494464,
"step": 229000
},
{
"epoch": 2.1175298253383894,
"grad_norm": 1.2546645402908325,
"learning_rate": 3.9412397006855445e-05,
"loss": 0.0502,
"num_input_tokens_seen": 235006464,
"step": 229500
},
{
"epoch": 2.122143180077689,
"grad_norm": 3.3777077198028564,
"learning_rate": 3.938933023315895e-05,
"loss": 0.0513,
"num_input_tokens_seen": 235518464,
"step": 230000
},
{
"epoch": 2.1267565348169883,
"grad_norm": 1.0438088178634644,
"learning_rate": 3.936626345946245e-05,
"loss": 0.0452,
"num_input_tokens_seen": 236030464,
"step": 230500
},
{
"epoch": 2.1313698895562876,
"grad_norm": 3.252018928527832,
"learning_rate": 3.934319668576596e-05,
"loss": 0.0463,
"num_input_tokens_seen": 236542464,
"step": 231000
},
{
"epoch": 2.135983244295587,
"grad_norm": 0.6309357285499573,
"learning_rate": 3.932012991206946e-05,
"loss": 0.0456,
"num_input_tokens_seen": 237054464,
"step": 231500
},
{
"epoch": 2.140596599034886,
"grad_norm": 0.6404411196708679,
"learning_rate": 3.929706313837296e-05,
"loss": 0.0469,
"num_input_tokens_seen": 237566464,
"step": 232000
},
{
"epoch": 2.1452099537741853,
"grad_norm": 2.673940896987915,
"learning_rate": 3.927399636467647e-05,
"loss": 0.0495,
"num_input_tokens_seen": 238078464,
"step": 232500
},
{
"epoch": 2.149823308513485,
"grad_norm": 0.5295352935791016,
"learning_rate": 3.9250929590979976e-05,
"loss": 0.0488,
"num_input_tokens_seen": 238590464,
"step": 233000
},
{
"epoch": 2.1544366632527843,
"grad_norm": 2.1107120513916016,
"learning_rate": 3.922786281728347e-05,
"loss": 0.0471,
"num_input_tokens_seen": 239102464,
"step": 233500
},
{
"epoch": 2.1590500179920835,
"grad_norm": 0.7328481674194336,
"learning_rate": 3.920479604358698e-05,
"loss": 0.0482,
"num_input_tokens_seen": 239614464,
"step": 234000
},
{
"epoch": 2.1636633727313828,
"grad_norm": 0.5566291213035583,
"learning_rate": 3.9181729269890485e-05,
"loss": 0.0449,
"num_input_tokens_seen": 240126464,
"step": 234500
},
{
"epoch": 2.168276727470682,
"grad_norm": 2.311140537261963,
"learning_rate": 3.915866249619398e-05,
"loss": 0.0453,
"num_input_tokens_seen": 240638464,
"step": 235000
},
{
"epoch": 2.1728900822099813,
"grad_norm": 0.43719959259033203,
"learning_rate": 3.9135595722497486e-05,
"loss": 0.0484,
"num_input_tokens_seen": 241150464,
"step": 235500
},
{
"epoch": 2.177503436949281,
"grad_norm": 1.3434603214263916,
"learning_rate": 3.911252894880099e-05,
"loss": 0.0471,
"num_input_tokens_seen": 241662464,
"step": 236000
},
{
"epoch": 2.18211679168858,
"grad_norm": 1.4311593770980835,
"learning_rate": 3.9089462175104494e-05,
"loss": 0.0466,
"num_input_tokens_seen": 242174464,
"step": 236500
},
{
"epoch": 2.1867301464278794,
"grad_norm": 1.6135164499282837,
"learning_rate": 3.9066395401407994e-05,
"loss": 0.0459,
"num_input_tokens_seen": 242686464,
"step": 237000
},
{
"epoch": 2.1913435011671787,
"grad_norm": 0.8135620951652527,
"learning_rate": 3.90433286277115e-05,
"loss": 0.0484,
"num_input_tokens_seen": 243198464,
"step": 237500
},
{
"epoch": 2.195956855906478,
"grad_norm": 2.1880440711975098,
"learning_rate": 3.9020261854015e-05,
"loss": 0.0493,
"num_input_tokens_seen": 243710464,
"step": 238000
},
{
"epoch": 2.200570210645777,
"grad_norm": 1.676583170890808,
"learning_rate": 3.899719508031851e-05,
"loss": 0.0505,
"num_input_tokens_seen": 244222464,
"step": 238500
},
{
"epoch": 2.205183565385077,
"grad_norm": 2.2629077434539795,
"learning_rate": 3.897412830662201e-05,
"loss": 0.0501,
"num_input_tokens_seen": 244734464,
"step": 239000
},
{
"epoch": 2.209796920124376,
"grad_norm": 2.8751511573791504,
"learning_rate": 3.895106153292552e-05,
"loss": 0.0446,
"num_input_tokens_seen": 245246464,
"step": 239500
},
{
"epoch": 2.2144102748636754,
"grad_norm": 2.8819162845611572,
"learning_rate": 3.892799475922902e-05,
"loss": 0.05,
"num_input_tokens_seen": 245758464,
"step": 240000
},
{
"epoch": 2.2190236296029746,
"grad_norm": 2.6944236755371094,
"learning_rate": 3.890492798553252e-05,
"loss": 0.0491,
"num_input_tokens_seen": 246270464,
"step": 240500
},
{
"epoch": 2.223636984342274,
"grad_norm": 1.2675094604492188,
"learning_rate": 3.8881861211836026e-05,
"loss": 0.054,
"num_input_tokens_seen": 246782464,
"step": 241000
},
{
"epoch": 2.2282503390815736,
"grad_norm": 3.3482534885406494,
"learning_rate": 3.885879443813953e-05,
"loss": 0.0436,
"num_input_tokens_seen": 247294464,
"step": 241500
},
{
"epoch": 2.232863693820873,
"grad_norm": 4.079286575317383,
"learning_rate": 3.8835727664443034e-05,
"loss": 0.0451,
"num_input_tokens_seen": 247806464,
"step": 242000
},
{
"epoch": 2.237477048560172,
"grad_norm": 1.210747480392456,
"learning_rate": 3.8812660890746535e-05,
"loss": 0.0474,
"num_input_tokens_seen": 248318464,
"step": 242500
},
{
"epoch": 2.2420904032994713,
"grad_norm": 0.7511959671974182,
"learning_rate": 3.8789594117050035e-05,
"loss": 0.0548,
"num_input_tokens_seen": 248830464,
"step": 243000
},
{
"epoch": 2.2467037580387705,
"grad_norm": 2.5810165405273438,
"learning_rate": 3.876652734335354e-05,
"loss": 0.0501,
"num_input_tokens_seen": 249342464,
"step": 243500
},
{
"epoch": 2.25131711277807,
"grad_norm": 1.060328722000122,
"learning_rate": 3.874346056965705e-05,
"loss": 0.0473,
"num_input_tokens_seen": 249854464,
"step": 244000
},
{
"epoch": 2.255930467517369,
"grad_norm": 0.6183954477310181,
"learning_rate": 3.8720393795960544e-05,
"loss": 0.0486,
"num_input_tokens_seen": 250366464,
"step": 244500
},
{
"epoch": 2.2605438222566687,
"grad_norm": 1.4669181108474731,
"learning_rate": 3.869732702226405e-05,
"loss": 0.046,
"num_input_tokens_seen": 250878464,
"step": 245000
},
{
"epoch": 2.265157176995968,
"grad_norm": 0.44876328110694885,
"learning_rate": 3.867426024856756e-05,
"loss": 0.0494,
"num_input_tokens_seen": 251390464,
"step": 245500
},
{
"epoch": 2.269770531735267,
"grad_norm": 1.458533763885498,
"learning_rate": 3.865119347487106e-05,
"loss": 0.0514,
"num_input_tokens_seen": 251902464,
"step": 246000
},
{
"epoch": 2.2743838864745665,
"grad_norm": 1.5308929681777954,
"learning_rate": 3.862812670117456e-05,
"loss": 0.0482,
"num_input_tokens_seen": 252414464,
"step": 246500
},
{
"epoch": 2.2789972412138657,
"grad_norm": 2.227228879928589,
"learning_rate": 3.860505992747807e-05,
"loss": 0.0489,
"num_input_tokens_seen": 252926464,
"step": 247000
},
{
"epoch": 2.2836105959531654,
"grad_norm": 0.44453561305999756,
"learning_rate": 3.858199315378157e-05,
"loss": 0.0494,
"num_input_tokens_seen": 253438464,
"step": 247500
},
{
"epoch": 2.2882239506924646,
"grad_norm": 1.6029125452041626,
"learning_rate": 3.855892638008507e-05,
"loss": 0.0512,
"num_input_tokens_seen": 253950464,
"step": 248000
},
{
"epoch": 2.292837305431764,
"grad_norm": 0.9729604125022888,
"learning_rate": 3.8535859606388576e-05,
"loss": 0.0479,
"num_input_tokens_seen": 254462464,
"step": 248500
},
{
"epoch": 2.297450660171063,
"grad_norm": 2.042520046234131,
"learning_rate": 3.8512792832692076e-05,
"loss": 0.0505,
"num_input_tokens_seen": 254974464,
"step": 249000
},
{
"epoch": 2.3020640149103624,
"grad_norm": 0.6108492016792297,
"learning_rate": 3.8489726058995583e-05,
"loss": 0.0486,
"num_input_tokens_seen": 255486464,
"step": 249500
},
{
"epoch": 2.3066773696496616,
"grad_norm": 3.030125379562378,
"learning_rate": 3.8466659285299084e-05,
"loss": 0.0489,
"num_input_tokens_seen": 255998464,
"step": 250000
},
{
"epoch": 2.3112907243889613,
"grad_norm": 1.440781831741333,
"learning_rate": 3.844359251160259e-05,
"loss": 0.0486,
"num_input_tokens_seen": 256510464,
"step": 250500
},
{
"epoch": 2.3159040791282606,
"grad_norm": 2.0030038356781006,
"learning_rate": 3.842052573790609e-05,
"loss": 0.051,
"num_input_tokens_seen": 257022464,
"step": 251000
},
{
"epoch": 2.32051743386756,
"grad_norm": 0.7390642166137695,
"learning_rate": 3.83974589642096e-05,
"loss": 0.0524,
"num_input_tokens_seen": 257534464,
"step": 251500
},
{
"epoch": 2.325130788606859,
"grad_norm": 1.2793288230895996,
"learning_rate": 3.83743921905131e-05,
"loss": 0.0511,
"num_input_tokens_seen": 258046464,
"step": 252000
},
{
"epoch": 2.3297441433461583,
"grad_norm": 0.9258439540863037,
"learning_rate": 3.83513254168166e-05,
"loss": 0.0452,
"num_input_tokens_seen": 258558464,
"step": 252500
},
{
"epoch": 2.334357498085458,
"grad_norm": 1.6350897550582886,
"learning_rate": 3.832825864312011e-05,
"loss": 0.0512,
"num_input_tokens_seen": 259070464,
"step": 253000
},
{
"epoch": 2.3389708528247573,
"grad_norm": 0.529399037361145,
"learning_rate": 3.830519186942361e-05,
"loss": 0.0508,
"num_input_tokens_seen": 259582464,
"step": 253500
},
{
"epoch": 2.3435842075640565,
"grad_norm": 1.1488155126571655,
"learning_rate": 3.828212509572711e-05,
"loss": 0.0507,
"num_input_tokens_seen": 260094464,
"step": 254000
},
{
"epoch": 2.3481975623033557,
"grad_norm": 1.7055829763412476,
"learning_rate": 3.8259058322030616e-05,
"loss": 0.0512,
"num_input_tokens_seen": 260606464,
"step": 254500
},
{
"epoch": 2.352810917042655,
"grad_norm": 1.6156001091003418,
"learning_rate": 3.8235991548334124e-05,
"loss": 0.0475,
"num_input_tokens_seen": 261118464,
"step": 255000
},
{
"epoch": 2.3574242717819542,
"grad_norm": 1.6147477626800537,
"learning_rate": 3.821292477463762e-05,
"loss": 0.0486,
"num_input_tokens_seen": 261630464,
"step": 255500
},
{
"epoch": 2.3620376265212535,
"grad_norm": 2.267575979232788,
"learning_rate": 3.8189858000941125e-05,
"loss": 0.0531,
"num_input_tokens_seen": 262142464,
"step": 256000
},
{
"epoch": 2.366650981260553,
"grad_norm": 4.673060417175293,
"learning_rate": 3.816679122724463e-05,
"loss": 0.0482,
"num_input_tokens_seen": 262654464,
"step": 256500
},
{
"epoch": 2.3712643359998524,
"grad_norm": 0.9855422377586365,
"learning_rate": 3.814372445354813e-05,
"loss": 0.0513,
"num_input_tokens_seen": 263166464,
"step": 257000
},
{
"epoch": 2.3758776907391517,
"grad_norm": 2.0277483463287354,
"learning_rate": 3.8120657679851633e-05,
"loss": 0.0486,
"num_input_tokens_seen": 263678464,
"step": 257500
},
{
"epoch": 2.380491045478451,
"grad_norm": 2.461817979812622,
"learning_rate": 3.809759090615514e-05,
"loss": 0.0467,
"num_input_tokens_seen": 264190464,
"step": 258000
},
{
"epoch": 2.38510440021775,
"grad_norm": 1.2786630392074585,
"learning_rate": 3.807452413245864e-05,
"loss": 0.0449,
"num_input_tokens_seen": 264702464,
"step": 258500
},
{
"epoch": 2.38971775495705,
"grad_norm": 0.7494092583656311,
"learning_rate": 3.805145735876215e-05,
"loss": 0.0444,
"num_input_tokens_seen": 265214464,
"step": 259000
},
{
"epoch": 2.394331109696349,
"grad_norm": 0.7989722490310669,
"learning_rate": 3.802839058506565e-05,
"loss": 0.0474,
"num_input_tokens_seen": 265726464,
"step": 259500
},
{
"epoch": 2.3989444644356483,
"grad_norm": 1.17472505569458,
"learning_rate": 3.800532381136916e-05,
"loss": 0.0508,
"num_input_tokens_seen": 266238464,
"step": 260000
},
{
"epoch": 2.4035578191749476,
"grad_norm": 4.456437587738037,
"learning_rate": 3.798225703767266e-05,
"loss": 0.0536,
"num_input_tokens_seen": 266750464,
"step": 260500
},
{
"epoch": 2.408171173914247,
"grad_norm": 1.390002727508545,
"learning_rate": 3.795919026397616e-05,
"loss": 0.0489,
"num_input_tokens_seen": 267262464,
"step": 261000
},
{
"epoch": 2.412784528653546,
"grad_norm": 3.4362330436706543,
"learning_rate": 3.7936123490279665e-05,
"loss": 0.0455,
"num_input_tokens_seen": 267774464,
"step": 261500
},
{
"epoch": 2.417397883392846,
"grad_norm": 3.1407535076141357,
"learning_rate": 3.7913056716583166e-05,
"loss": 0.0488,
"num_input_tokens_seen": 268286464,
"step": 262000
},
{
"epoch": 2.422011238132145,
"grad_norm": 5.290740966796875,
"learning_rate": 3.788998994288667e-05,
"loss": 0.0529,
"num_input_tokens_seen": 268798464,
"step": 262500
},
{
"epoch": 2.4266245928714443,
"grad_norm": 0.8178442716598511,
"learning_rate": 3.7866923169190174e-05,
"loss": 0.0488,
"num_input_tokens_seen": 269310464,
"step": 263000
},
{
"epoch": 2.4312379476107435,
"grad_norm": 1.9484672546386719,
"learning_rate": 3.7843856395493674e-05,
"loss": 0.047,
"num_input_tokens_seen": 269822464,
"step": 263500
},
{
"epoch": 2.4358513023500428,
"grad_norm": 3.035595178604126,
"learning_rate": 3.782078962179718e-05,
"loss": 0.0465,
"num_input_tokens_seen": 270334464,
"step": 264000
},
{
"epoch": 2.4404646570893425,
"grad_norm": 1.731019377708435,
"learning_rate": 3.779772284810069e-05,
"loss": 0.0458,
"num_input_tokens_seen": 270846464,
"step": 264500
},
{
"epoch": 2.4450780118286417,
"grad_norm": 1.4459056854248047,
"learning_rate": 3.777465607440418e-05,
"loss": 0.0469,
"num_input_tokens_seen": 271358464,
"step": 265000
},
{
"epoch": 2.449691366567941,
"grad_norm": 1.475520372390747,
"learning_rate": 3.775158930070769e-05,
"loss": 0.0489,
"num_input_tokens_seen": 271870464,
"step": 265500
},
{
"epoch": 2.45430472130724,
"grad_norm": 1.0083856582641602,
"learning_rate": 3.77285225270112e-05,
"loss": 0.0474,
"num_input_tokens_seen": 272382464,
"step": 266000
},
{
"epoch": 2.4589180760465394,
"grad_norm": 1.0660340785980225,
"learning_rate": 3.770545575331469e-05,
"loss": 0.0531,
"num_input_tokens_seen": 272894464,
"step": 266500
},
{
"epoch": 2.4635314307858387,
"grad_norm": 2.4508252143859863,
"learning_rate": 3.76823889796182e-05,
"loss": 0.0484,
"num_input_tokens_seen": 273406464,
"step": 267000
},
{
"epoch": 2.468144785525138,
"grad_norm": 1.2447962760925293,
"learning_rate": 3.7659322205921706e-05,
"loss": 0.0543,
"num_input_tokens_seen": 273918464,
"step": 267500
},
{
"epoch": 2.4727581402644376,
"grad_norm": 0.9269862174987793,
"learning_rate": 3.763625543222521e-05,
"loss": 0.047,
"num_input_tokens_seen": 274430464,
"step": 268000
},
{
"epoch": 2.477371495003737,
"grad_norm": 1.8680906295776367,
"learning_rate": 3.761318865852871e-05,
"loss": 0.0488,
"num_input_tokens_seen": 274942464,
"step": 268500
},
{
"epoch": 2.481984849743036,
"grad_norm": 2.0206573009490967,
"learning_rate": 3.7590121884832215e-05,
"loss": 0.0481,
"num_input_tokens_seen": 275454464,
"step": 269000
},
{
"epoch": 2.4865982044823354,
"grad_norm": 1.7884100675582886,
"learning_rate": 3.7567055111135715e-05,
"loss": 0.0491,
"num_input_tokens_seen": 275966464,
"step": 269500
},
{
"epoch": 2.4912115592216346,
"grad_norm": 0.8701728582382202,
"learning_rate": 3.754398833743922e-05,
"loss": 0.0482,
"num_input_tokens_seen": 276478464,
"step": 270000
},
{
"epoch": 2.4958249139609343,
"grad_norm": 1.0109634399414062,
"learning_rate": 3.752092156374272e-05,
"loss": 0.0501,
"num_input_tokens_seen": 276990464,
"step": 270500
},
{
"epoch": 2.5004382687002336,
"grad_norm": 2.7722220420837402,
"learning_rate": 3.749785479004623e-05,
"loss": 0.0521,
"num_input_tokens_seen": 277502464,
"step": 271000
},
{
"epoch": 2.505051623439533,
"grad_norm": 0.6980007886886597,
"learning_rate": 3.747478801634973e-05,
"loss": 0.0489,
"num_input_tokens_seen": 278014464,
"step": 271500
},
{
"epoch": 2.509664978178832,
"grad_norm": 1.2792749404907227,
"learning_rate": 3.745172124265324e-05,
"loss": 0.0489,
"num_input_tokens_seen": 278526464,
"step": 272000
},
{
"epoch": 2.5142783329181313,
"grad_norm": 2.294569969177246,
"learning_rate": 3.742865446895674e-05,
"loss": 0.0499,
"num_input_tokens_seen": 279038464,
"step": 272500
},
{
"epoch": 2.5188916876574305,
"grad_norm": 0.667633593082428,
"learning_rate": 3.740558769526024e-05,
"loss": 0.0493,
"num_input_tokens_seen": 279550464,
"step": 273000
},
{
"epoch": 2.52350504239673,
"grad_norm": 1.3469390869140625,
"learning_rate": 3.738252092156375e-05,
"loss": 0.0495,
"num_input_tokens_seen": 280062464,
"step": 273500
},
{
"epoch": 2.5281183971360295,
"grad_norm": 1.247475266456604,
"learning_rate": 3.735945414786725e-05,
"loss": 0.0511,
"num_input_tokens_seen": 280574464,
"step": 274000
},
{
"epoch": 2.5327317518753287,
"grad_norm": 0.4033117890357971,
"learning_rate": 3.733638737417075e-05,
"loss": 0.0535,
"num_input_tokens_seen": 281086464,
"step": 274500
},
{
"epoch": 2.537345106614628,
"grad_norm": 1.1649394035339355,
"learning_rate": 3.7313320600474255e-05,
"loss": 0.0532,
"num_input_tokens_seen": 281598464,
"step": 275000
},
{
"epoch": 2.5419584613539272,
"grad_norm": 2.126436710357666,
"learning_rate": 3.729025382677776e-05,
"loss": 0.0485,
"num_input_tokens_seen": 282110464,
"step": 275500
},
{
"epoch": 2.546571816093227,
"grad_norm": 0.8005649447441101,
"learning_rate": 3.726718705308126e-05,
"loss": 0.0506,
"num_input_tokens_seen": 282622464,
"step": 276000
},
{
"epoch": 2.551185170832526,
"grad_norm": 2.3989765644073486,
"learning_rate": 3.7244120279384764e-05,
"loss": 0.0513,
"num_input_tokens_seen": 283134464,
"step": 276500
},
{
"epoch": 2.5557985255718254,
"grad_norm": 0.7040809988975525,
"learning_rate": 3.722105350568827e-05,
"loss": 0.0496,
"num_input_tokens_seen": 283646464,
"step": 277000
},
{
"epoch": 2.5604118803111247,
"grad_norm": 1.1335313320159912,
"learning_rate": 3.719798673199177e-05,
"loss": 0.0546,
"num_input_tokens_seen": 284158464,
"step": 277500
},
{
"epoch": 2.565025235050424,
"grad_norm": 0.9312555193901062,
"learning_rate": 3.717491995829527e-05,
"loss": 0.0516,
"num_input_tokens_seen": 284670464,
"step": 278000
},
{
"epoch": 2.569638589789723,
"grad_norm": 0.7695990800857544,
"learning_rate": 3.715185318459878e-05,
"loss": 0.0521,
"num_input_tokens_seen": 285182464,
"step": 278500
},
{
"epoch": 2.5742519445290224,
"grad_norm": 1.258518934249878,
"learning_rate": 3.712878641090228e-05,
"loss": 0.0499,
"num_input_tokens_seen": 285694464,
"step": 279000
},
{
"epoch": 2.578865299268322,
"grad_norm": 2.346951961517334,
"learning_rate": 3.710571963720578e-05,
"loss": 0.0472,
"num_input_tokens_seen": 286206464,
"step": 279500
},
{
"epoch": 2.5834786540076213,
"grad_norm": 0.8598672747612,
"learning_rate": 3.708265286350929e-05,
"loss": 0.0526,
"num_input_tokens_seen": 286718464,
"step": 280000
},
{
"epoch": 2.5880920087469206,
"grad_norm": 1.0490000247955322,
"learning_rate": 3.705958608981279e-05,
"loss": 0.0441,
"num_input_tokens_seen": 287230464,
"step": 280500
},
{
"epoch": 2.59270536348622,
"grad_norm": 0.49518364667892456,
"learning_rate": 3.7036519316116296e-05,
"loss": 0.0474,
"num_input_tokens_seen": 287742464,
"step": 281000
},
{
"epoch": 2.597318718225519,
"grad_norm": 1.5736312866210938,
"learning_rate": 3.70134525424198e-05,
"loss": 0.0509,
"num_input_tokens_seen": 288254464,
"step": 281500
},
{
"epoch": 2.6019320729648188,
"grad_norm": 2.511143445968628,
"learning_rate": 3.6990385768723304e-05,
"loss": 0.047,
"num_input_tokens_seen": 288766464,
"step": 282000
},
{
"epoch": 2.606545427704118,
"grad_norm": 0.9060021638870239,
"learning_rate": 3.6967318995026805e-05,
"loss": 0.053,
"num_input_tokens_seen": 289278464,
"step": 282500
},
{
"epoch": 2.6111587824434173,
"grad_norm": 1.4283766746520996,
"learning_rate": 3.694425222133031e-05,
"loss": 0.0476,
"num_input_tokens_seen": 289790464,
"step": 283000
},
{
"epoch": 2.6157721371827165,
"grad_norm": 1.5333555936813354,
"learning_rate": 3.692118544763381e-05,
"loss": 0.0538,
"num_input_tokens_seen": 290302464,
"step": 283500
},
{
"epoch": 2.6203854919220158,
"grad_norm": 1.615579605102539,
"learning_rate": 3.689811867393731e-05,
"loss": 0.0475,
"num_input_tokens_seen": 290814464,
"step": 284000
},
{
"epoch": 2.624998846661315,
"grad_norm": 1.5331679582595825,
"learning_rate": 3.687505190024082e-05,
"loss": 0.0471,
"num_input_tokens_seen": 291326464,
"step": 284500
},
{
"epoch": 2.6296122014006142,
"grad_norm": 2.3747360706329346,
"learning_rate": 3.685198512654433e-05,
"loss": 0.0477,
"num_input_tokens_seen": 291838464,
"step": 285000
},
{
"epoch": 2.634225556139914,
"grad_norm": 2.0471205711364746,
"learning_rate": 3.682891835284782e-05,
"loss": 0.0493,
"num_input_tokens_seen": 292350464,
"step": 285500
},
{
"epoch": 2.638838910879213,
"grad_norm": 1.0454156398773193,
"learning_rate": 3.680585157915133e-05,
"loss": 0.0467,
"num_input_tokens_seen": 292862464,
"step": 286000
},
{
"epoch": 2.6434522656185124,
"grad_norm": 2.0174975395202637,
"learning_rate": 3.678278480545484e-05,
"loss": 0.0526,
"num_input_tokens_seen": 293374464,
"step": 286500
},
{
"epoch": 2.6480656203578117,
"grad_norm": 1.8630324602127075,
"learning_rate": 3.675971803175833e-05,
"loss": 0.0489,
"num_input_tokens_seen": 293886464,
"step": 287000
},
{
"epoch": 2.6526789750971114,
"grad_norm": 2.270232915878296,
"learning_rate": 3.673665125806184e-05,
"loss": 0.0509,
"num_input_tokens_seen": 294398464,
"step": 287500
},
{
"epoch": 2.6572923298364106,
"grad_norm": 1.7369494438171387,
"learning_rate": 3.6713584484365345e-05,
"loss": 0.0504,
"num_input_tokens_seen": 294910464,
"step": 288000
},
{
"epoch": 2.66190568457571,
"grad_norm": 0.9229201078414917,
"learning_rate": 3.6690517710668846e-05,
"loss": 0.0467,
"num_input_tokens_seen": 295422464,
"step": 288500
},
{
"epoch": 2.666519039315009,
"grad_norm": 1.377439260482788,
"learning_rate": 3.6667450936972346e-05,
"loss": 0.0515,
"num_input_tokens_seen": 295934464,
"step": 289000
},
{
"epoch": 2.6711323940543084,
"grad_norm": 1.9601995944976807,
"learning_rate": 3.6644384163275854e-05,
"loss": 0.0527,
"num_input_tokens_seen": 296446464,
"step": 289500
},
{
"epoch": 2.6757457487936076,
"grad_norm": 1.4592013359069824,
"learning_rate": 3.6621317389579354e-05,
"loss": 0.0491,
"num_input_tokens_seen": 296958464,
"step": 290000
},
{
"epoch": 2.680359103532907,
"grad_norm": 0.35405218601226807,
"learning_rate": 3.659825061588286e-05,
"loss": 0.0472,
"num_input_tokens_seen": 297470464,
"step": 290500
},
{
"epoch": 2.6849724582722065,
"grad_norm": 1.9252680540084839,
"learning_rate": 3.657518384218636e-05,
"loss": 0.0469,
"num_input_tokens_seen": 297982464,
"step": 291000
},
{
"epoch": 2.689585813011506,
"grad_norm": 1.1235663890838623,
"learning_rate": 3.655211706848987e-05,
"loss": 0.0485,
"num_input_tokens_seen": 298494464,
"step": 291500
},
{
"epoch": 2.694199167750805,
"grad_norm": 0.9481515884399414,
"learning_rate": 3.652905029479337e-05,
"loss": 0.049,
"num_input_tokens_seen": 299006464,
"step": 292000
},
{
"epoch": 2.6988125224901043,
"grad_norm": 0.37934771180152893,
"learning_rate": 3.650598352109687e-05,
"loss": 0.052,
"num_input_tokens_seen": 299518464,
"step": 292500
},
{
"epoch": 2.7034258772294035,
"grad_norm": 1.1855201721191406,
"learning_rate": 3.648291674740038e-05,
"loss": 0.0492,
"num_input_tokens_seen": 300030464,
"step": 293000
},
{
"epoch": 2.708039231968703,
"grad_norm": 1.4538213014602661,
"learning_rate": 3.645984997370388e-05,
"loss": 0.0503,
"num_input_tokens_seen": 300542464,
"step": 293500
},
{
"epoch": 2.7126525867080025,
"grad_norm": 2.1017704010009766,
"learning_rate": 3.6436783200007386e-05,
"loss": 0.0458,
"num_input_tokens_seen": 301054464,
"step": 294000
},
{
"epoch": 2.7172659414473017,
"grad_norm": 0.6946723461151123,
"learning_rate": 3.6413716426310887e-05,
"loss": 0.0524,
"num_input_tokens_seen": 301566464,
"step": 294500
},
{
"epoch": 2.721879296186601,
"grad_norm": 3.0771243572235107,
"learning_rate": 3.639064965261439e-05,
"loss": 0.0518,
"num_input_tokens_seen": 302078464,
"step": 295000
},
{
"epoch": 2.7264926509259,
"grad_norm": 1.259162425994873,
"learning_rate": 3.6367582878917895e-05,
"loss": 0.0532,
"num_input_tokens_seen": 302590464,
"step": 295500
},
{
"epoch": 2.7311060056651995,
"grad_norm": 1.8771902322769165,
"learning_rate": 3.63445161052214e-05,
"loss": 0.0487,
"num_input_tokens_seen": 303102464,
"step": 296000
},
{
"epoch": 2.7357193604044987,
"grad_norm": 1.765956997871399,
"learning_rate": 3.6321449331524896e-05,
"loss": 0.0437,
"num_input_tokens_seen": 303614464,
"step": 296500
},
{
"epoch": 2.7403327151437984,
"grad_norm": 1.2610450983047485,
"learning_rate": 3.62983825578284e-05,
"loss": 0.044,
"num_input_tokens_seen": 304126464,
"step": 297000
},
{
"epoch": 2.7449460698830976,
"grad_norm": 4.452374458312988,
"learning_rate": 3.627531578413191e-05,
"loss": 0.0507,
"num_input_tokens_seen": 304638464,
"step": 297500
},
{
"epoch": 2.749559424622397,
"grad_norm": 1.082930088043213,
"learning_rate": 3.625224901043541e-05,
"loss": 0.0518,
"num_input_tokens_seen": 305150464,
"step": 298000
},
{
"epoch": 2.754172779361696,
"grad_norm": 0.708118200302124,
"learning_rate": 3.622918223673891e-05,
"loss": 0.0483,
"num_input_tokens_seen": 305662464,
"step": 298500
},
{
"epoch": 2.758786134100996,
"grad_norm": 1.1710622310638428,
"learning_rate": 3.620611546304242e-05,
"loss": 0.051,
"num_input_tokens_seen": 306174464,
"step": 299000
},
{
"epoch": 2.763399488840295,
"grad_norm": 2.388134002685547,
"learning_rate": 3.618304868934592e-05,
"loss": 0.0506,
"num_input_tokens_seen": 306686464,
"step": 299500
},
{
"epoch": 2.7680128435795943,
"grad_norm": 2.3141307830810547,
"learning_rate": 3.615998191564942e-05,
"loss": 0.0464,
"num_input_tokens_seen": 307198464,
"step": 300000
},
{
"epoch": 2.7726261983188936,
"grad_norm": 1.966213345527649,
"learning_rate": 3.613691514195293e-05,
"loss": 0.0501,
"num_input_tokens_seen": 307710464,
"step": 300500
},
{
"epoch": 2.777239553058193,
"grad_norm": 3.948702573776245,
"learning_rate": 3.611384836825643e-05,
"loss": 0.0495,
"num_input_tokens_seen": 308222464,
"step": 301000
},
{
"epoch": 2.781852907797492,
"grad_norm": 1.3868130445480347,
"learning_rate": 3.6090781594559935e-05,
"loss": 0.0471,
"num_input_tokens_seen": 308734464,
"step": 301500
},
{
"epoch": 2.7864662625367913,
"grad_norm": 1.42705500125885,
"learning_rate": 3.6067714820863436e-05,
"loss": 0.0474,
"num_input_tokens_seen": 309246464,
"step": 302000
},
{
"epoch": 2.7910796172760906,
"grad_norm": 1.4073491096496582,
"learning_rate": 3.604464804716694e-05,
"loss": 0.0459,
"num_input_tokens_seen": 309758464,
"step": 302500
},
{
"epoch": 2.7956929720153902,
"grad_norm": 1.990958333015442,
"learning_rate": 3.6021581273470444e-05,
"loss": 0.0461,
"num_input_tokens_seen": 310270464,
"step": 303000
},
{
"epoch": 2.8003063267546895,
"grad_norm": 2.2346065044403076,
"learning_rate": 3.599851449977395e-05,
"loss": 0.0534,
"num_input_tokens_seen": 310782464,
"step": 303500
},
{
"epoch": 2.8049196814939887,
"grad_norm": 1.1180897951126099,
"learning_rate": 3.597544772607745e-05,
"loss": 0.0459,
"num_input_tokens_seen": 311294464,
"step": 304000
},
{
"epoch": 2.809533036233288,
"grad_norm": 1.765995979309082,
"learning_rate": 3.595238095238095e-05,
"loss": 0.0443,
"num_input_tokens_seen": 311806464,
"step": 304500
},
{
"epoch": 2.8141463909725877,
"grad_norm": 0.6811426877975464,
"learning_rate": 3.592931417868446e-05,
"loss": 0.0488,
"num_input_tokens_seen": 312318464,
"step": 305000
},
{
"epoch": 2.818759745711887,
"grad_norm": 2.811584234237671,
"learning_rate": 3.590624740498796e-05,
"loss": 0.0517,
"num_input_tokens_seen": 312830464,
"step": 305500
},
{
"epoch": 2.823373100451186,
"grad_norm": 2.9501793384552,
"learning_rate": 3.588318063129146e-05,
"loss": 0.0537,
"num_input_tokens_seen": 313342464,
"step": 306000
},
{
"epoch": 2.8279864551904854,
"grad_norm": 0.9767802357673645,
"learning_rate": 3.586011385759497e-05,
"loss": 0.0473,
"num_input_tokens_seen": 313854464,
"step": 306500
},
{
"epoch": 2.8325998099297847,
"grad_norm": 1.463254451751709,
"learning_rate": 3.5837047083898476e-05,
"loss": 0.0498,
"num_input_tokens_seen": 314366464,
"step": 307000
},
{
"epoch": 2.837213164669084,
"grad_norm": 1.6375666856765747,
"learning_rate": 3.581398031020197e-05,
"loss": 0.0494,
"num_input_tokens_seen": 314878464,
"step": 307500
},
{
"epoch": 2.841826519408383,
"grad_norm": 6.093188285827637,
"learning_rate": 3.579091353650548e-05,
"loss": 0.0505,
"num_input_tokens_seen": 315390464,
"step": 308000
},
{
"epoch": 2.846439874147683,
"grad_norm": 1.2764623165130615,
"learning_rate": 3.5767846762808984e-05,
"loss": 0.0529,
"num_input_tokens_seen": 315902464,
"step": 308500
},
{
"epoch": 2.851053228886982,
"grad_norm": 0.9110862612724304,
"learning_rate": 3.5744779989112485e-05,
"loss": 0.0486,
"num_input_tokens_seen": 316414464,
"step": 309000
},
{
"epoch": 2.8556665836262813,
"grad_norm": 1.6029491424560547,
"learning_rate": 3.5721713215415985e-05,
"loss": 0.0524,
"num_input_tokens_seen": 316926464,
"step": 309500
},
{
"epoch": 2.8602799383655806,
"grad_norm": 1.162832498550415,
"learning_rate": 3.569864644171949e-05,
"loss": 0.0497,
"num_input_tokens_seen": 317438464,
"step": 310000
},
{
"epoch": 2.8648932931048803,
"grad_norm": 0.8766358494758606,
"learning_rate": 3.567557966802299e-05,
"loss": 0.0529,
"num_input_tokens_seen": 317950464,
"step": 310500
},
{
"epoch": 2.8695066478441795,
"grad_norm": 1.384810209274292,
"learning_rate": 3.56525128943265e-05,
"loss": 0.0495,
"num_input_tokens_seen": 318462464,
"step": 311000
},
{
"epoch": 2.8741200025834788,
"grad_norm": 3.1389269828796387,
"learning_rate": 3.562944612063e-05,
"loss": 0.0495,
"num_input_tokens_seen": 318974464,
"step": 311500
},
{
"epoch": 2.878733357322778,
"grad_norm": 2.004563570022583,
"learning_rate": 3.56063793469335e-05,
"loss": 0.0498,
"num_input_tokens_seen": 319486464,
"step": 312000
},
{
"epoch": 2.8833467120620773,
"grad_norm": 2.8419971466064453,
"learning_rate": 3.558331257323701e-05,
"loss": 0.0497,
"num_input_tokens_seen": 319998464,
"step": 312500
},
{
"epoch": 2.8879600668013765,
"grad_norm": 1.0195252895355225,
"learning_rate": 3.556024579954051e-05,
"loss": 0.0496,
"num_input_tokens_seen": 320510464,
"step": 313000
},
{
"epoch": 2.8925734215406758,
"grad_norm": 1.6460163593292236,
"learning_rate": 3.553717902584402e-05,
"loss": 0.0465,
"num_input_tokens_seen": 321022464,
"step": 313500
},
{
"epoch": 2.897186776279975,
"grad_norm": 0.9986339211463928,
"learning_rate": 3.551411225214752e-05,
"loss": 0.0494,
"num_input_tokens_seen": 321534464,
"step": 314000
},
{
"epoch": 2.9018001310192747,
"grad_norm": 0.7910524606704712,
"learning_rate": 3.5491045478451025e-05,
"loss": 0.0488,
"num_input_tokens_seen": 322046464,
"step": 314500
},
{
"epoch": 2.906413485758574,
"grad_norm": 0.8609081506729126,
"learning_rate": 3.5467978704754526e-05,
"loss": 0.0522,
"num_input_tokens_seen": 322558464,
"step": 315000
},
{
"epoch": 2.911026840497873,
"grad_norm": 0.49892082810401917,
"learning_rate": 3.5444911931058026e-05,
"loss": 0.0471,
"num_input_tokens_seen": 323070464,
"step": 315500
},
{
"epoch": 2.9156401952371724,
"grad_norm": 1.161789894104004,
"learning_rate": 3.5421845157361534e-05,
"loss": 0.0519,
"num_input_tokens_seen": 323582464,
"step": 316000
},
{
"epoch": 2.920253549976472,
"grad_norm": 2.9082627296447754,
"learning_rate": 3.539877838366504e-05,
"loss": 0.0517,
"num_input_tokens_seen": 324094464,
"step": 316500
},
{
"epoch": 2.9248669047157714,
"grad_norm": 2.1669368743896484,
"learning_rate": 3.5375711609968535e-05,
"loss": 0.0506,
"num_input_tokens_seen": 324606464,
"step": 317000
},
{
"epoch": 2.9294802594550706,
"grad_norm": 0.955956220626831,
"learning_rate": 3.535264483627204e-05,
"loss": 0.0508,
"num_input_tokens_seen": 325118464,
"step": 317500
},
{
"epoch": 2.93409361419437,
"grad_norm": 1.6256439685821533,
"learning_rate": 3.532957806257555e-05,
"loss": 0.0468,
"num_input_tokens_seen": 325630464,
"step": 318000
},
{
"epoch": 2.938706968933669,
"grad_norm": 1.479632019996643,
"learning_rate": 3.530651128887904e-05,
"loss": 0.0468,
"num_input_tokens_seen": 326142464,
"step": 318500
},
{
"epoch": 2.9433203236729684,
"grad_norm": 0.8990212082862854,
"learning_rate": 3.528344451518255e-05,
"loss": 0.0515,
"num_input_tokens_seen": 326654464,
"step": 319000
},
{
"epoch": 2.9479336784122676,
"grad_norm": 0.5225000381469727,
"learning_rate": 3.526037774148606e-05,
"loss": 0.0474,
"num_input_tokens_seen": 327166464,
"step": 319500
},
{
"epoch": 2.9525470331515673,
"grad_norm": 0.6462964415550232,
"learning_rate": 3.523731096778956e-05,
"loss": 0.0523,
"num_input_tokens_seen": 327678464,
"step": 320000
},
{
"epoch": 2.9571603878908665,
"grad_norm": 1.1759368181228638,
"learning_rate": 3.521424419409306e-05,
"loss": 0.0485,
"num_input_tokens_seen": 328190464,
"step": 320500
},
{
"epoch": 2.961773742630166,
"grad_norm": 0.6114454865455627,
"learning_rate": 3.5191177420396567e-05,
"loss": 0.0522,
"num_input_tokens_seen": 328702464,
"step": 321000
},
{
"epoch": 2.966387097369465,
"grad_norm": 0.8368657231330872,
"learning_rate": 3.516811064670007e-05,
"loss": 0.0468,
"num_input_tokens_seen": 329214464,
"step": 321500
},
{
"epoch": 2.9710004521087643,
"grad_norm": 0.39750799536705017,
"learning_rate": 3.5145043873003574e-05,
"loss": 0.0491,
"num_input_tokens_seen": 329726464,
"step": 322000
},
{
"epoch": 2.975613806848064,
"grad_norm": 1.4396777153015137,
"learning_rate": 3.5121977099307075e-05,
"loss": 0.0486,
"num_input_tokens_seen": 330238464,
"step": 322500
},
{
"epoch": 2.9802271615873632,
"grad_norm": 6.470019817352295,
"learning_rate": 3.5098910325610576e-05,
"loss": 0.0466,
"num_input_tokens_seen": 330750464,
"step": 323000
},
{
"epoch": 2.9848405163266625,
"grad_norm": 0.8978260159492493,
"learning_rate": 3.507584355191408e-05,
"loss": 0.051,
"num_input_tokens_seen": 331262464,
"step": 323500
},
{
"epoch": 2.9894538710659617,
"grad_norm": 1.2832305431365967,
"learning_rate": 3.505277677821759e-05,
"loss": 0.05,
"num_input_tokens_seen": 331774464,
"step": 324000
},
{
"epoch": 2.994067225805261,
"grad_norm": 1.4465861320495605,
"learning_rate": 3.502971000452109e-05,
"loss": 0.0491,
"num_input_tokens_seen": 332286464,
"step": 324500
},
{
"epoch": 2.99868058054456,
"grad_norm": 0.7884268164634705,
"learning_rate": 3.500664323082459e-05,
"loss": 0.0559,
"num_input_tokens_seen": 332798464,
"step": 325000
},
{
"epoch": 3.0,
"eval_combined_score": 0.07028037235137267,
"eval_loss": 0.07028037309646606,
"eval_mse": 0.07028037160627928,
"eval_runtime": 46.6351,
"eval_samples_per_second": 2065.784,
"eval_steps_per_second": 258.239,
"num_input_tokens_seen": 332944128,
"step": 325143
},
{
"epoch": 3.00329393528386,
"grad_norm": 1.5264211893081665,
"learning_rate": 3.49835764571281e-05,
"loss": 0.0458,
"num_input_tokens_seen": 333309696,
"step": 325500
},
{
"epoch": 3.007907290023159,
"grad_norm": 0.4709686040878296,
"learning_rate": 3.49605096834316e-05,
"loss": 0.0373,
"num_input_tokens_seen": 333821696,
"step": 326000
},
{
"epoch": 3.0125206447624584,
"grad_norm": 1.1726654767990112,
"learning_rate": 3.49374429097351e-05,
"loss": 0.0367,
"num_input_tokens_seen": 334333696,
"step": 326500
},
{
"epoch": 3.0171339995017576,
"grad_norm": 0.5303038358688354,
"learning_rate": 3.491437613603861e-05,
"loss": 0.0398,
"num_input_tokens_seen": 334845696,
"step": 327000
},
{
"epoch": 3.021747354241057,
"grad_norm": 1.8502370119094849,
"learning_rate": 3.4891309362342115e-05,
"loss": 0.0344,
"num_input_tokens_seen": 335357696,
"step": 327500
},
{
"epoch": 3.026360708980356,
"grad_norm": 0.6410061120986938,
"learning_rate": 3.486824258864561e-05,
"loss": 0.0387,
"num_input_tokens_seen": 335869696,
"step": 328000
},
{
"epoch": 3.030974063719656,
"grad_norm": 2.9425787925720215,
"learning_rate": 3.4845175814949116e-05,
"loss": 0.0408,
"num_input_tokens_seen": 336381696,
"step": 328500
},
{
"epoch": 3.035587418458955,
"grad_norm": 3.2158591747283936,
"learning_rate": 3.482210904125262e-05,
"loss": 0.039,
"num_input_tokens_seen": 336893696,
"step": 329000
},
{
"epoch": 3.0402007731982543,
"grad_norm": 1.0993469953536987,
"learning_rate": 3.4799042267556124e-05,
"loss": 0.0427,
"num_input_tokens_seen": 337405696,
"step": 329500
},
{
"epoch": 3.0448141279375536,
"grad_norm": 0.733238697052002,
"learning_rate": 3.4775975493859624e-05,
"loss": 0.0364,
"num_input_tokens_seen": 337917696,
"step": 330000
},
{
"epoch": 3.049427482676853,
"grad_norm": 1.7866772413253784,
"learning_rate": 3.475290872016313e-05,
"loss": 0.0367,
"num_input_tokens_seen": 338429696,
"step": 330500
},
{
"epoch": 3.054040837416152,
"grad_norm": 2.1485824584960938,
"learning_rate": 3.472984194646663e-05,
"loss": 0.0375,
"num_input_tokens_seen": 338941696,
"step": 331000
},
{
"epoch": 3.0586541921554518,
"grad_norm": 0.9480071663856506,
"learning_rate": 3.470677517277013e-05,
"loss": 0.0361,
"num_input_tokens_seen": 339453696,
"step": 331500
},
{
"epoch": 3.063267546894751,
"grad_norm": 1.3875316381454468,
"learning_rate": 3.468370839907364e-05,
"loss": 0.04,
"num_input_tokens_seen": 339965696,
"step": 332000
},
{
"epoch": 3.0678809016340503,
"grad_norm": 1.2781360149383545,
"learning_rate": 3.466064162537714e-05,
"loss": 0.0407,
"num_input_tokens_seen": 340477696,
"step": 332500
},
{
"epoch": 3.0724942563733495,
"grad_norm": 1.129167079925537,
"learning_rate": 3.463757485168065e-05,
"loss": 0.0386,
"num_input_tokens_seen": 340989696,
"step": 333000
},
{
"epoch": 3.0771076111126487,
"grad_norm": 1.3005669116973877,
"learning_rate": 3.461450807798415e-05,
"loss": 0.0389,
"num_input_tokens_seen": 341501696,
"step": 333500
},
{
"epoch": 3.0817209658519484,
"grad_norm": 1.7916690111160278,
"learning_rate": 3.4591441304287656e-05,
"loss": 0.0357,
"num_input_tokens_seen": 342013696,
"step": 334000
},
{
"epoch": 3.0863343205912477,
"grad_norm": 0.6907594799995422,
"learning_rate": 3.456837453059116e-05,
"loss": 0.0408,
"num_input_tokens_seen": 342525696,
"step": 334500
},
{
"epoch": 3.090947675330547,
"grad_norm": 1.9678852558135986,
"learning_rate": 3.4545307756894664e-05,
"loss": 0.0394,
"num_input_tokens_seen": 343037696,
"step": 335000
},
{
"epoch": 3.095561030069846,
"grad_norm": 2.437412977218628,
"learning_rate": 3.4522240983198165e-05,
"loss": 0.0374,
"num_input_tokens_seen": 343549696,
"step": 335500
},
{
"epoch": 3.1001743848091454,
"grad_norm": 0.7736024260520935,
"learning_rate": 3.4499174209501665e-05,
"loss": 0.0398,
"num_input_tokens_seen": 344061696,
"step": 336000
},
{
"epoch": 3.1047877395484447,
"grad_norm": 1.619535207748413,
"learning_rate": 3.447610743580517e-05,
"loss": 0.0407,
"num_input_tokens_seen": 344573696,
"step": 336500
},
{
"epoch": 3.1094010942877444,
"grad_norm": 0.7229686975479126,
"learning_rate": 3.445304066210867e-05,
"loss": 0.035,
"num_input_tokens_seen": 345085696,
"step": 337000
},
{
"epoch": 3.1140144490270436,
"grad_norm": 0.757798433303833,
"learning_rate": 3.4429973888412174e-05,
"loss": 0.0356,
"num_input_tokens_seen": 345597696,
"step": 337500
},
{
"epoch": 3.118627803766343,
"grad_norm": 1.478723168373108,
"learning_rate": 3.440690711471568e-05,
"loss": 0.0375,
"num_input_tokens_seen": 346109696,
"step": 338000
},
{
"epoch": 3.123241158505642,
"grad_norm": 1.482269525527954,
"learning_rate": 3.438384034101919e-05,
"loss": 0.0382,
"num_input_tokens_seen": 346621696,
"step": 338500
},
{
"epoch": 3.1278545132449413,
"grad_norm": 1.0418490171432495,
"learning_rate": 3.436077356732268e-05,
"loss": 0.0364,
"num_input_tokens_seen": 347133696,
"step": 339000
},
{
"epoch": 3.1324678679842406,
"grad_norm": 0.8459765911102295,
"learning_rate": 3.433770679362619e-05,
"loss": 0.0355,
"num_input_tokens_seen": 347645696,
"step": 339500
},
{
"epoch": 3.1370812227235403,
"grad_norm": 0.91368168592453,
"learning_rate": 3.43146400199297e-05,
"loss": 0.0384,
"num_input_tokens_seen": 348157696,
"step": 340000
},
{
"epoch": 3.1416945774628395,
"grad_norm": 1.1992415189743042,
"learning_rate": 3.42915732462332e-05,
"loss": 0.0402,
"num_input_tokens_seen": 348669696,
"step": 340500
},
{
"epoch": 3.146307932202139,
"grad_norm": 1.1619198322296143,
"learning_rate": 3.42685064725367e-05,
"loss": 0.0401,
"num_input_tokens_seen": 349181696,
"step": 341000
},
{
"epoch": 3.150921286941438,
"grad_norm": 0.8243937492370605,
"learning_rate": 3.4245439698840206e-05,
"loss": 0.039,
"num_input_tokens_seen": 349693696,
"step": 341500
},
{
"epoch": 3.1555346416807373,
"grad_norm": 1.217475175857544,
"learning_rate": 3.4222372925143706e-05,
"loss": 0.0392,
"num_input_tokens_seen": 350205696,
"step": 342000
},
{
"epoch": 3.1601479964200365,
"grad_norm": 1.7150335311889648,
"learning_rate": 3.4199306151447214e-05,
"loss": 0.0352,
"num_input_tokens_seen": 350717696,
"step": 342500
},
{
"epoch": 3.164761351159336,
"grad_norm": 0.892362117767334,
"learning_rate": 3.4176239377750714e-05,
"loss": 0.0403,
"num_input_tokens_seen": 351229696,
"step": 343000
},
{
"epoch": 3.1693747058986355,
"grad_norm": 0.5353464484214783,
"learning_rate": 3.4153172604054215e-05,
"loss": 0.0378,
"num_input_tokens_seen": 351741696,
"step": 343500
},
{
"epoch": 3.1739880606379347,
"grad_norm": 1.603272557258606,
"learning_rate": 3.413010583035772e-05,
"loss": 0.0401,
"num_input_tokens_seen": 352253696,
"step": 344000
},
{
"epoch": 3.178601415377234,
"grad_norm": 1.0198638439178467,
"learning_rate": 3.410703905666122e-05,
"loss": 0.0364,
"num_input_tokens_seen": 352765696,
"step": 344500
},
{
"epoch": 3.183214770116533,
"grad_norm": 0.7820620536804199,
"learning_rate": 3.408397228296473e-05,
"loss": 0.038,
"num_input_tokens_seen": 353277696,
"step": 345000
},
{
"epoch": 3.187828124855833,
"grad_norm": 1.567887306213379,
"learning_rate": 3.406090550926823e-05,
"loss": 0.0368,
"num_input_tokens_seen": 353789696,
"step": 345500
},
{
"epoch": 3.192441479595132,
"grad_norm": 1.5703437328338623,
"learning_rate": 3.403783873557174e-05,
"loss": 0.0385,
"num_input_tokens_seen": 354301696,
"step": 346000
},
{
"epoch": 3.1970548343344314,
"grad_norm": 0.5745303630828857,
"learning_rate": 3.401477196187524e-05,
"loss": 0.0368,
"num_input_tokens_seen": 354813696,
"step": 346500
},
{
"epoch": 3.2016681890737306,
"grad_norm": 0.9760965704917908,
"learning_rate": 3.399170518817874e-05,
"loss": 0.0414,
"num_input_tokens_seen": 355325696,
"step": 347000
},
{
"epoch": 3.20628154381303,
"grad_norm": 1.1067168712615967,
"learning_rate": 3.3968638414482246e-05,
"loss": 0.0379,
"num_input_tokens_seen": 355837696,
"step": 347500
},
{
"epoch": 3.210894898552329,
"grad_norm": 1.1161097288131714,
"learning_rate": 3.3945571640785754e-05,
"loss": 0.0384,
"num_input_tokens_seen": 356349696,
"step": 348000
},
{
"epoch": 3.2155082532916284,
"grad_norm": 2.1467411518096924,
"learning_rate": 3.392250486708925e-05,
"loss": 0.0387,
"num_input_tokens_seen": 356861696,
"step": 348500
},
{
"epoch": 3.220121608030928,
"grad_norm": 1.2950456142425537,
"learning_rate": 3.3899438093392755e-05,
"loss": 0.0362,
"num_input_tokens_seen": 357373696,
"step": 349000
},
{
"epoch": 3.2247349627702273,
"grad_norm": 1.0559481382369995,
"learning_rate": 3.387637131969626e-05,
"loss": 0.0395,
"num_input_tokens_seen": 357885696,
"step": 349500
},
{
"epoch": 3.2293483175095266,
"grad_norm": 1.2557491064071655,
"learning_rate": 3.385330454599976e-05,
"loss": 0.0409,
"num_input_tokens_seen": 358397696,
"step": 350000
},
{
"epoch": 3.233961672248826,
"grad_norm": 0.9372035264968872,
"learning_rate": 3.3830237772303264e-05,
"loss": 0.0404,
"num_input_tokens_seen": 358909696,
"step": 350500
},
{
"epoch": 3.238575026988125,
"grad_norm": 0.6541593670845032,
"learning_rate": 3.380717099860677e-05,
"loss": 0.0376,
"num_input_tokens_seen": 359421696,
"step": 351000
},
{
"epoch": 3.2431883817274247,
"grad_norm": 0.9174505472183228,
"learning_rate": 3.378410422491027e-05,
"loss": 0.0403,
"num_input_tokens_seen": 359933696,
"step": 351500
},
{
"epoch": 3.247801736466724,
"grad_norm": 0.9051727056503296,
"learning_rate": 3.376103745121377e-05,
"loss": 0.0375,
"num_input_tokens_seen": 360445696,
"step": 352000
},
{
"epoch": 3.2524150912060232,
"grad_norm": 1.1875522136688232,
"learning_rate": 3.373797067751728e-05,
"loss": 0.0431,
"num_input_tokens_seen": 360957696,
"step": 352500
},
{
"epoch": 3.2570284459453225,
"grad_norm": 0.1862681657075882,
"learning_rate": 3.371490390382078e-05,
"loss": 0.0385,
"num_input_tokens_seen": 361469696,
"step": 353000
},
{
"epoch": 3.2616418006846217,
"grad_norm": 1.5912601947784424,
"learning_rate": 3.369183713012429e-05,
"loss": 0.0371,
"num_input_tokens_seen": 361981696,
"step": 353500
},
{
"epoch": 3.266255155423921,
"grad_norm": 1.4725751876831055,
"learning_rate": 3.366877035642779e-05,
"loss": 0.0417,
"num_input_tokens_seen": 362493696,
"step": 354000
},
{
"epoch": 3.2708685101632207,
"grad_norm": 0.7821846604347229,
"learning_rate": 3.364570358273129e-05,
"loss": 0.0371,
"num_input_tokens_seen": 363005696,
"step": 354500
},
{
"epoch": 3.27548186490252,
"grad_norm": 1.3403239250183105,
"learning_rate": 3.3622636809034796e-05,
"loss": 0.0437,
"num_input_tokens_seen": 363517696,
"step": 355000
},
{
"epoch": 3.280095219641819,
"grad_norm": 1.3142443895339966,
"learning_rate": 3.35995700353383e-05,
"loss": 0.0424,
"num_input_tokens_seen": 364029696,
"step": 355500
},
{
"epoch": 3.2847085743811184,
"grad_norm": 0.7003629207611084,
"learning_rate": 3.3576503261641804e-05,
"loss": 0.038,
"num_input_tokens_seen": 364541696,
"step": 356000
},
{
"epoch": 3.2893219291204177,
"grad_norm": 2.1016480922698975,
"learning_rate": 3.3553436487945304e-05,
"loss": 0.0389,
"num_input_tokens_seen": 365053696,
"step": 356500
},
{
"epoch": 3.2939352838597173,
"grad_norm": 0.9255128502845764,
"learning_rate": 3.353036971424881e-05,
"loss": 0.0414,
"num_input_tokens_seen": 365565696,
"step": 357000
},
{
"epoch": 3.2985486385990166,
"grad_norm": 2.0615665912628174,
"learning_rate": 3.350730294055231e-05,
"loss": 0.0376,
"num_input_tokens_seen": 366077696,
"step": 357500
},
{
"epoch": 3.303161993338316,
"grad_norm": 0.5057035088539124,
"learning_rate": 3.348423616685581e-05,
"loss": 0.0441,
"num_input_tokens_seen": 366589696,
"step": 358000
},
{
"epoch": 3.307775348077615,
"grad_norm": 2.8129680156707764,
"learning_rate": 3.346116939315932e-05,
"loss": 0.0368,
"num_input_tokens_seen": 367101696,
"step": 358500
},
{
"epoch": 3.3123887028169143,
"grad_norm": 2.223184823989868,
"learning_rate": 3.343810261946283e-05,
"loss": 0.0423,
"num_input_tokens_seen": 367613696,
"step": 359000
},
{
"epoch": 3.3170020575562136,
"grad_norm": 1.127394199371338,
"learning_rate": 3.341503584576632e-05,
"loss": 0.0397,
"num_input_tokens_seen": 368125696,
"step": 359500
},
{
"epoch": 3.321615412295513,
"grad_norm": 2.887812376022339,
"learning_rate": 3.339196907206983e-05,
"loss": 0.0379,
"num_input_tokens_seen": 368637696,
"step": 360000
},
{
"epoch": 3.3262287670348125,
"grad_norm": 1.08502197265625,
"learning_rate": 3.3368902298373336e-05,
"loss": 0.0421,
"num_input_tokens_seen": 369149696,
"step": 360500
},
{
"epoch": 3.3308421217741118,
"grad_norm": 1.0474424362182617,
"learning_rate": 3.334583552467684e-05,
"loss": 0.04,
"num_input_tokens_seen": 369661696,
"step": 361000
},
{
"epoch": 3.335455476513411,
"grad_norm": 0.7261756658554077,
"learning_rate": 3.332276875098034e-05,
"loss": 0.0409,
"num_input_tokens_seen": 370173696,
"step": 361500
},
{
"epoch": 3.3400688312527103,
"grad_norm": 0.6790010929107666,
"learning_rate": 3.3299701977283845e-05,
"loss": 0.0403,
"num_input_tokens_seen": 370685696,
"step": 362000
},
{
"epoch": 3.3446821859920095,
"grad_norm": 1.7215800285339355,
"learning_rate": 3.3276635203587345e-05,
"loss": 0.0411,
"num_input_tokens_seen": 371197696,
"step": 362500
},
{
"epoch": 3.349295540731309,
"grad_norm": 1.112464189529419,
"learning_rate": 3.325356842989085e-05,
"loss": 0.0421,
"num_input_tokens_seen": 371709696,
"step": 363000
},
{
"epoch": 3.3539088954706084,
"grad_norm": 1.0138994455337524,
"learning_rate": 3.323050165619435e-05,
"loss": 0.0369,
"num_input_tokens_seen": 372221696,
"step": 363500
},
{
"epoch": 3.3585222502099077,
"grad_norm": 0.584247887134552,
"learning_rate": 3.3207434882497854e-05,
"loss": 0.0402,
"num_input_tokens_seen": 372733696,
"step": 364000
},
{
"epoch": 3.363135604949207,
"grad_norm": 1.9375905990600586,
"learning_rate": 3.318436810880136e-05,
"loss": 0.0359,
"num_input_tokens_seen": 373245696,
"step": 364500
},
{
"epoch": 3.367748959688506,
"grad_norm": 1.225064992904663,
"learning_rate": 3.316130133510486e-05,
"loss": 0.0378,
"num_input_tokens_seen": 373757696,
"step": 365000
},
{
"epoch": 3.3723623144278054,
"grad_norm": 1.0532304048538208,
"learning_rate": 3.313823456140836e-05,
"loss": 0.0422,
"num_input_tokens_seen": 374269696,
"step": 365500
},
{
"epoch": 3.376975669167105,
"grad_norm": 0.950737714767456,
"learning_rate": 3.311516778771187e-05,
"loss": 0.0385,
"num_input_tokens_seen": 374781696,
"step": 366000
},
{
"epoch": 3.3815890239064044,
"grad_norm": 0.340679794549942,
"learning_rate": 3.309210101401538e-05,
"loss": 0.0364,
"num_input_tokens_seen": 375293696,
"step": 366500
},
{
"epoch": 3.3862023786457036,
"grad_norm": 4.747739791870117,
"learning_rate": 3.306903424031888e-05,
"loss": 0.0354,
"num_input_tokens_seen": 375805696,
"step": 367000
},
{
"epoch": 3.390815733385003,
"grad_norm": 1.7227208614349365,
"learning_rate": 3.304596746662238e-05,
"loss": 0.0413,
"num_input_tokens_seen": 376317696,
"step": 367500
},
{
"epoch": 3.395429088124302,
"grad_norm": 1.4410547018051147,
"learning_rate": 3.3022900692925886e-05,
"loss": 0.0359,
"num_input_tokens_seen": 376829696,
"step": 368000
},
{
"epoch": 3.400042442863602,
"grad_norm": 0.847284197807312,
"learning_rate": 3.2999833919229386e-05,
"loss": 0.0437,
"num_input_tokens_seen": 377341696,
"step": 368500
},
{
"epoch": 3.404655797602901,
"grad_norm": 1.7439848184585571,
"learning_rate": 3.297676714553289e-05,
"loss": 0.0362,
"num_input_tokens_seen": 377853696,
"step": 369000
},
{
"epoch": 3.4092691523422003,
"grad_norm": 0.6023704409599304,
"learning_rate": 3.2953700371836394e-05,
"loss": 0.0418,
"num_input_tokens_seen": 378365696,
"step": 369500
},
{
"epoch": 3.4138825070814995,
"grad_norm": 0.3590753972530365,
"learning_rate": 3.29306335981399e-05,
"loss": 0.0402,
"num_input_tokens_seen": 378877696,
"step": 370000
},
{
"epoch": 3.418495861820799,
"grad_norm": 1.0211530923843384,
"learning_rate": 3.2907566824443395e-05,
"loss": 0.0374,
"num_input_tokens_seen": 379389696,
"step": 370500
},
{
"epoch": 3.423109216560098,
"grad_norm": 0.9513002038002014,
"learning_rate": 3.28845000507469e-05,
"loss": 0.0401,
"num_input_tokens_seen": 379901696,
"step": 371000
},
{
"epoch": 3.4277225712993973,
"grad_norm": 1.0161465406417847,
"learning_rate": 3.286143327705041e-05,
"loss": 0.0403,
"num_input_tokens_seen": 380413696,
"step": 371500
},
{
"epoch": 3.432335926038697,
"grad_norm": 1.2249014377593994,
"learning_rate": 3.283836650335391e-05,
"loss": 0.0401,
"num_input_tokens_seen": 380925696,
"step": 372000
},
{
"epoch": 3.436949280777996,
"grad_norm": 1.3249224424362183,
"learning_rate": 3.281529972965741e-05,
"loss": 0.0414,
"num_input_tokens_seen": 381437696,
"step": 372500
},
{
"epoch": 3.4415626355172955,
"grad_norm": 3.6392204761505127,
"learning_rate": 3.279223295596092e-05,
"loss": 0.0367,
"num_input_tokens_seen": 381949696,
"step": 373000
},
{
"epoch": 3.4461759902565947,
"grad_norm": 0.9922639727592468,
"learning_rate": 3.276916618226442e-05,
"loss": 0.0418,
"num_input_tokens_seen": 382461696,
"step": 373500
},
{
"epoch": 3.450789344995894,
"grad_norm": 2.1645193099975586,
"learning_rate": 3.2746099408567926e-05,
"loss": 0.0382,
"num_input_tokens_seen": 382973696,
"step": 374000
},
{
"epoch": 3.4554026997351937,
"grad_norm": 2.5222291946411133,
"learning_rate": 3.272303263487143e-05,
"loss": 0.0399,
"num_input_tokens_seen": 383485696,
"step": 374500
},
{
"epoch": 3.460016054474493,
"grad_norm": 2.2609009742736816,
"learning_rate": 3.269996586117493e-05,
"loss": 0.0395,
"num_input_tokens_seen": 383997696,
"step": 375000
},
{
"epoch": 3.464629409213792,
"grad_norm": 3.2856132984161377,
"learning_rate": 3.2676899087478435e-05,
"loss": 0.0391,
"num_input_tokens_seen": 384509696,
"step": 375500
},
{
"epoch": 3.4692427639530914,
"grad_norm": 0.6138939261436462,
"learning_rate": 3.265383231378194e-05,
"loss": 0.0398,
"num_input_tokens_seen": 385021696,
"step": 376000
},
{
"epoch": 3.4738561186923906,
"grad_norm": 1.3824810981750488,
"learning_rate": 3.263076554008544e-05,
"loss": 0.0374,
"num_input_tokens_seen": 385533696,
"step": 376500
},
{
"epoch": 3.47846947343169,
"grad_norm": 1.539600133895874,
"learning_rate": 3.2607698766388943e-05,
"loss": 0.0397,
"num_input_tokens_seen": 386045696,
"step": 377000
},
{
"epoch": 3.483082828170989,
"grad_norm": 0.7915021181106567,
"learning_rate": 3.258463199269245e-05,
"loss": 0.0408,
"num_input_tokens_seen": 386557696,
"step": 377500
},
{
"epoch": 3.487696182910289,
"grad_norm": 1.5975933074951172,
"learning_rate": 3.256156521899595e-05,
"loss": 0.0382,
"num_input_tokens_seen": 387069696,
"step": 378000
},
{
"epoch": 3.492309537649588,
"grad_norm": 1.8749665021896362,
"learning_rate": 3.253849844529945e-05,
"loss": 0.0407,
"num_input_tokens_seen": 387581696,
"step": 378500
},
{
"epoch": 3.4969228923888873,
"grad_norm": 1.7674627304077148,
"learning_rate": 3.251543167160296e-05,
"loss": 0.04,
"num_input_tokens_seen": 388093696,
"step": 379000
},
{
"epoch": 3.5015362471281866,
"grad_norm": 0.8147306442260742,
"learning_rate": 3.249236489790646e-05,
"loss": 0.04,
"num_input_tokens_seen": 388605696,
"step": 379500
},
{
"epoch": 3.5061496018674863,
"grad_norm": 0.7411497235298157,
"learning_rate": 3.246929812420996e-05,
"loss": 0.0394,
"num_input_tokens_seen": 389117696,
"step": 380000
},
{
"epoch": 3.5107629566067855,
"grad_norm": 1.145559549331665,
"learning_rate": 3.244623135051347e-05,
"loss": 0.0432,
"num_input_tokens_seen": 389629696,
"step": 380500
},
{
"epoch": 3.5153763113460847,
"grad_norm": 1.1018445491790771,
"learning_rate": 3.2423164576816975e-05,
"loss": 0.0426,
"num_input_tokens_seen": 390141696,
"step": 381000
},
{
"epoch": 3.519989666085384,
"grad_norm": 5.711886882781982,
"learning_rate": 3.2400097803120476e-05,
"loss": 0.0362,
"num_input_tokens_seen": 390653696,
"step": 381500
},
{
"epoch": 3.5246030208246832,
"grad_norm": 5.521966934204102,
"learning_rate": 3.2377031029423976e-05,
"loss": 0.0445,
"num_input_tokens_seen": 391165696,
"step": 382000
},
{
"epoch": 3.5292163755639825,
"grad_norm": 1.7097331285476685,
"learning_rate": 3.2353964255727484e-05,
"loss": 0.0394,
"num_input_tokens_seen": 391677696,
"step": 382500
},
{
"epoch": 3.5338297303032817,
"grad_norm": 2.794013023376465,
"learning_rate": 3.2330897482030984e-05,
"loss": 0.0418,
"num_input_tokens_seen": 392189696,
"step": 383000
},
{
"epoch": 3.5384430850425814,
"grad_norm": 0.8009048700332642,
"learning_rate": 3.2307830708334485e-05,
"loss": 0.0402,
"num_input_tokens_seen": 392701696,
"step": 383500
},
{
"epoch": 3.5430564397818807,
"grad_norm": 1.5974643230438232,
"learning_rate": 3.228476393463799e-05,
"loss": 0.0403,
"num_input_tokens_seen": 393213696,
"step": 384000
},
{
"epoch": 3.54766979452118,
"grad_norm": 2.538250207901001,
"learning_rate": 3.226169716094149e-05,
"loss": 0.0401,
"num_input_tokens_seen": 393725696,
"step": 384500
},
{
"epoch": 3.552283149260479,
"grad_norm": 1.2976337671279907,
"learning_rate": 3.2238630387245e-05,
"loss": 0.0379,
"num_input_tokens_seen": 394237696,
"step": 385000
},
{
"epoch": 3.5568965039997784,
"grad_norm": 1.1865109205245972,
"learning_rate": 3.22155636135485e-05,
"loss": 0.04,
"num_input_tokens_seen": 394749696,
"step": 385500
},
{
"epoch": 3.561509858739078,
"grad_norm": 0.36470434069633484,
"learning_rate": 3.2192496839852e-05,
"loss": 0.0399,
"num_input_tokens_seen": 395261696,
"step": 386000
},
{
"epoch": 3.5661232134783774,
"grad_norm": 2.1635212898254395,
"learning_rate": 3.216943006615551e-05,
"loss": 0.0403,
"num_input_tokens_seen": 395773696,
"step": 386500
},
{
"epoch": 3.5707365682176766,
"grad_norm": 1.7805256843566895,
"learning_rate": 3.2146363292459016e-05,
"loss": 0.0391,
"num_input_tokens_seen": 396285696,
"step": 387000
},
{
"epoch": 3.575349922956976,
"grad_norm": 1.5320919752120972,
"learning_rate": 3.212329651876252e-05,
"loss": 0.0417,
"num_input_tokens_seen": 396797696,
"step": 387500
},
{
"epoch": 3.579963277696275,
"grad_norm": 3.523890733718872,
"learning_rate": 3.210022974506602e-05,
"loss": 0.0394,
"num_input_tokens_seen": 397309696,
"step": 388000
},
{
"epoch": 3.5845766324355743,
"grad_norm": 1.2910226583480835,
"learning_rate": 3.2077162971369525e-05,
"loss": 0.0397,
"num_input_tokens_seen": 397821696,
"step": 388500
},
{
"epoch": 3.5891899871748736,
"grad_norm": 1.5501660108566284,
"learning_rate": 3.2054096197673025e-05,
"loss": 0.0396,
"num_input_tokens_seen": 398333696,
"step": 389000
},
{
"epoch": 3.5938033419141733,
"grad_norm": 1.1182091236114502,
"learning_rate": 3.2031029423976526e-05,
"loss": 0.0421,
"num_input_tokens_seen": 398845696,
"step": 389500
},
{
"epoch": 3.5984166966534725,
"grad_norm": 1.5010899305343628,
"learning_rate": 3.200796265028003e-05,
"loss": 0.038,
"num_input_tokens_seen": 399357696,
"step": 390000
},
{
"epoch": 3.6030300513927718,
"grad_norm": 0.4965997040271759,
"learning_rate": 3.198489587658354e-05,
"loss": 0.0392,
"num_input_tokens_seen": 399869696,
"step": 390500
},
{
"epoch": 3.607643406132071,
"grad_norm": 0.735758364200592,
"learning_rate": 3.1961829102887034e-05,
"loss": 0.0375,
"num_input_tokens_seen": 400381696,
"step": 391000
},
{
"epoch": 3.6122567608713707,
"grad_norm": 0.9119324684143066,
"learning_rate": 3.193876232919054e-05,
"loss": 0.0397,
"num_input_tokens_seen": 400893696,
"step": 391500
},
{
"epoch": 3.61687011561067,
"grad_norm": 1.0355151891708374,
"learning_rate": 3.191569555549405e-05,
"loss": 0.0379,
"num_input_tokens_seen": 401405696,
"step": 392000
},
{
"epoch": 3.621483470349969,
"grad_norm": 1.574038028717041,
"learning_rate": 3.189262878179755e-05,
"loss": 0.0398,
"num_input_tokens_seen": 401917696,
"step": 392500
},
{
"epoch": 3.6260968250892684,
"grad_norm": 1.9339407682418823,
"learning_rate": 3.186956200810105e-05,
"loss": 0.0366,
"num_input_tokens_seen": 402429696,
"step": 393000
},
{
"epoch": 3.6307101798285677,
"grad_norm": 1.808971643447876,
"learning_rate": 3.184649523440456e-05,
"loss": 0.0433,
"num_input_tokens_seen": 402941696,
"step": 393500
},
{
"epoch": 3.635323534567867,
"grad_norm": 0.8877146244049072,
"learning_rate": 3.182342846070806e-05,
"loss": 0.0402,
"num_input_tokens_seen": 403453696,
"step": 394000
},
{
"epoch": 3.639936889307166,
"grad_norm": 1.4622044563293457,
"learning_rate": 3.1800361687011566e-05,
"loss": 0.0429,
"num_input_tokens_seen": 403965696,
"step": 394500
},
{
"epoch": 3.6445502440464654,
"grad_norm": 1.1509592533111572,
"learning_rate": 3.1777294913315066e-05,
"loss": 0.0378,
"num_input_tokens_seen": 404477696,
"step": 395000
},
{
"epoch": 3.649163598785765,
"grad_norm": 1.6934188604354858,
"learning_rate": 3.175422813961857e-05,
"loss": 0.0395,
"num_input_tokens_seen": 404989696,
"step": 395500
},
{
"epoch": 3.6537769535250644,
"grad_norm": 2.861666202545166,
"learning_rate": 3.1731161365922074e-05,
"loss": 0.0382,
"num_input_tokens_seen": 405501696,
"step": 396000
},
{
"epoch": 3.6583903082643636,
"grad_norm": 1.3087468147277832,
"learning_rate": 3.1708094592225575e-05,
"loss": 0.0387,
"num_input_tokens_seen": 406013696,
"step": 396500
},
{
"epoch": 3.663003663003663,
"grad_norm": 0.8184057474136353,
"learning_rate": 3.1685027818529075e-05,
"loss": 0.0436,
"num_input_tokens_seen": 406525696,
"step": 397000
},
{
"epoch": 3.6676170177429626,
"grad_norm": 1.3447506427764893,
"learning_rate": 3.166196104483258e-05,
"loss": 0.0387,
"num_input_tokens_seen": 407037696,
"step": 397500
},
{
"epoch": 3.672230372482262,
"grad_norm": 1.8640304803848267,
"learning_rate": 3.163889427113609e-05,
"loss": 0.0427,
"num_input_tokens_seen": 407549696,
"step": 398000
},
{
"epoch": 3.676843727221561,
"grad_norm": 6.683871746063232,
"learning_rate": 3.161582749743959e-05,
"loss": 0.0413,
"num_input_tokens_seen": 408061696,
"step": 398500
},
{
"epoch": 3.6814570819608603,
"grad_norm": 0.6029996275901794,
"learning_rate": 3.159276072374309e-05,
"loss": 0.0428,
"num_input_tokens_seen": 408573696,
"step": 399000
},
{
"epoch": 3.6860704367001595,
"grad_norm": 0.6650155782699585,
"learning_rate": 3.15696939500466e-05,
"loss": 0.0376,
"num_input_tokens_seen": 409085696,
"step": 399500
},
{
"epoch": 3.690683791439459,
"grad_norm": 0.6915871500968933,
"learning_rate": 3.15466271763501e-05,
"loss": 0.04,
"num_input_tokens_seen": 409597696,
"step": 400000
},
{
"epoch": 3.695297146178758,
"grad_norm": 0.9651739597320557,
"learning_rate": 3.15235604026536e-05,
"loss": 0.0388,
"num_input_tokens_seen": 410109696,
"step": 400500
},
{
"epoch": 3.6999105009180577,
"grad_norm": 1.2852321863174438,
"learning_rate": 3.150049362895711e-05,
"loss": 0.0436,
"num_input_tokens_seen": 410621696,
"step": 401000
},
{
"epoch": 3.704523855657357,
"grad_norm": 1.250339150428772,
"learning_rate": 3.1477426855260614e-05,
"loss": 0.0371,
"num_input_tokens_seen": 411133696,
"step": 401500
},
{
"epoch": 3.7091372103966562,
"grad_norm": 0.9992502927780151,
"learning_rate": 3.1454360081564115e-05,
"loss": 0.0413,
"num_input_tokens_seen": 411645696,
"step": 402000
},
{
"epoch": 3.7137505651359555,
"grad_norm": 3.6451685428619385,
"learning_rate": 3.1431293307867615e-05,
"loss": 0.0425,
"num_input_tokens_seen": 412157696,
"step": 402500
},
{
"epoch": 3.718363919875255,
"grad_norm": 0.49393585324287415,
"learning_rate": 3.140822653417112e-05,
"loss": 0.0414,
"num_input_tokens_seen": 412669696,
"step": 403000
},
{
"epoch": 3.7229772746145544,
"grad_norm": 1.5764920711517334,
"learning_rate": 3.1385159760474623e-05,
"loss": 0.0373,
"num_input_tokens_seen": 413181696,
"step": 403500
},
{
"epoch": 3.7275906293538537,
"grad_norm": 2.7465178966522217,
"learning_rate": 3.1362092986778124e-05,
"loss": 0.0418,
"num_input_tokens_seen": 413693696,
"step": 404000
},
{
"epoch": 3.732203984093153,
"grad_norm": 2.4784648418426514,
"learning_rate": 3.133902621308163e-05,
"loss": 0.0373,
"num_input_tokens_seen": 414205696,
"step": 404500
},
{
"epoch": 3.736817338832452,
"grad_norm": 1.1435418128967285,
"learning_rate": 3.131595943938513e-05,
"loss": 0.0393,
"num_input_tokens_seen": 414717696,
"step": 405000
},
{
"epoch": 3.7414306935717514,
"grad_norm": 3.1641488075256348,
"learning_rate": 3.129289266568864e-05,
"loss": 0.0378,
"num_input_tokens_seen": 415229696,
"step": 405500
},
{
"epoch": 3.7460440483110506,
"grad_norm": 1.299619436264038,
"learning_rate": 3.126982589199214e-05,
"loss": 0.0376,
"num_input_tokens_seen": 415741696,
"step": 406000
},
{
"epoch": 3.75065740305035,
"grad_norm": 1.7014168500900269,
"learning_rate": 3.124675911829564e-05,
"loss": 0.0448,
"num_input_tokens_seen": 416253696,
"step": 406500
},
{
"epoch": 3.7552707577896496,
"grad_norm": 1.5592892169952393,
"learning_rate": 3.122369234459915e-05,
"loss": 0.038,
"num_input_tokens_seen": 416765696,
"step": 407000
},
{
"epoch": 3.759884112528949,
"grad_norm": 0.6049352884292603,
"learning_rate": 3.1200625570902655e-05,
"loss": 0.039,
"num_input_tokens_seen": 417277696,
"step": 407500
},
{
"epoch": 3.764497467268248,
"grad_norm": 0.6392286419868469,
"learning_rate": 3.117755879720615e-05,
"loss": 0.04,
"num_input_tokens_seen": 417789696,
"step": 408000
},
{
"epoch": 3.7691108220075473,
"grad_norm": 3.689347505569458,
"learning_rate": 3.1154492023509656e-05,
"loss": 0.0385,
"num_input_tokens_seen": 418301696,
"step": 408500
},
{
"epoch": 3.773724176746847,
"grad_norm": 0.8414890766143799,
"learning_rate": 3.1131425249813164e-05,
"loss": 0.0366,
"num_input_tokens_seen": 418813696,
"step": 409000
},
{
"epoch": 3.7783375314861463,
"grad_norm": 5.263124465942383,
"learning_rate": 3.1108358476116664e-05,
"loss": 0.0406,
"num_input_tokens_seen": 419325696,
"step": 409500
},
{
"epoch": 3.7829508862254455,
"grad_norm": 1.395107626914978,
"learning_rate": 3.1085291702420165e-05,
"loss": 0.0375,
"num_input_tokens_seen": 419837696,
"step": 410000
},
{
"epoch": 3.7875642409647448,
"grad_norm": 1.189859390258789,
"learning_rate": 3.106222492872367e-05,
"loss": 0.0373,
"num_input_tokens_seen": 420349696,
"step": 410500
},
{
"epoch": 3.792177595704044,
"grad_norm": 0.5523993372917175,
"learning_rate": 3.103915815502717e-05,
"loss": 0.0386,
"num_input_tokens_seen": 420861696,
"step": 411000
},
{
"epoch": 3.7967909504433432,
"grad_norm": 0.6239033341407776,
"learning_rate": 3.1016091381330673e-05,
"loss": 0.0369,
"num_input_tokens_seen": 421373696,
"step": 411500
},
{
"epoch": 3.8014043051826425,
"grad_norm": 2.072326421737671,
"learning_rate": 3.099302460763418e-05,
"loss": 0.0435,
"num_input_tokens_seen": 421885696,
"step": 412000
},
{
"epoch": 3.806017659921942,
"grad_norm": 2.074704647064209,
"learning_rate": 3.096995783393769e-05,
"loss": 0.04,
"num_input_tokens_seen": 422397696,
"step": 412500
},
{
"epoch": 3.8106310146612414,
"grad_norm": 1.9311884641647339,
"learning_rate": 3.094689106024119e-05,
"loss": 0.0428,
"num_input_tokens_seen": 422909696,
"step": 413000
},
{
"epoch": 3.8152443694005407,
"grad_norm": 1.3210355043411255,
"learning_rate": 3.092382428654469e-05,
"loss": 0.0429,
"num_input_tokens_seen": 423421696,
"step": 413500
},
{
"epoch": 3.81985772413984,
"grad_norm": 3.048222064971924,
"learning_rate": 3.09007575128482e-05,
"loss": 0.0418,
"num_input_tokens_seen": 423933696,
"step": 414000
},
{
"epoch": 3.824471078879139,
"grad_norm": 0.8300300240516663,
"learning_rate": 3.08776907391517e-05,
"loss": 0.0408,
"num_input_tokens_seen": 424445696,
"step": 414500
},
{
"epoch": 3.829084433618439,
"grad_norm": 0.6099697947502136,
"learning_rate": 3.0854623965455205e-05,
"loss": 0.0453,
"num_input_tokens_seen": 424957696,
"step": 415000
},
{
"epoch": 3.833697788357738,
"grad_norm": 1.205819845199585,
"learning_rate": 3.0831557191758705e-05,
"loss": 0.0379,
"num_input_tokens_seen": 425469696,
"step": 415500
},
{
"epoch": 3.8383111430970374,
"grad_norm": 2.9948160648345947,
"learning_rate": 3.0808490418062206e-05,
"loss": 0.0406,
"num_input_tokens_seen": 425981696,
"step": 416000
},
{
"epoch": 3.8429244978363366,
"grad_norm": 1.0202473402023315,
"learning_rate": 3.078542364436571e-05,
"loss": 0.0446,
"num_input_tokens_seen": 426493696,
"step": 416500
},
{
"epoch": 3.847537852575636,
"grad_norm": 1.2540485858917236,
"learning_rate": 3.0762356870669214e-05,
"loss": 0.0431,
"num_input_tokens_seen": 427005696,
"step": 417000
},
{
"epoch": 3.852151207314935,
"grad_norm": 1.10784113407135,
"learning_rate": 3.0739290096972714e-05,
"loss": 0.0403,
"num_input_tokens_seen": 427517696,
"step": 417500
},
{
"epoch": 3.8567645620542343,
"grad_norm": 1.326798439025879,
"learning_rate": 3.071622332327622e-05,
"loss": 0.0392,
"num_input_tokens_seen": 428029696,
"step": 418000
},
{
"epoch": 3.861377916793534,
"grad_norm": 0.7203147411346436,
"learning_rate": 3.069315654957973e-05,
"loss": 0.0412,
"num_input_tokens_seen": 428541696,
"step": 418500
},
{
"epoch": 3.8659912715328333,
"grad_norm": 2.017019510269165,
"learning_rate": 3.067008977588323e-05,
"loss": 0.0397,
"num_input_tokens_seen": 429053696,
"step": 419000
},
{
"epoch": 3.8706046262721325,
"grad_norm": 1.9709299802780151,
"learning_rate": 3.064702300218673e-05,
"loss": 0.0382,
"num_input_tokens_seen": 429565696,
"step": 419500
},
{
"epoch": 3.875217981011432,
"grad_norm": 3.0947420597076416,
"learning_rate": 3.062395622849024e-05,
"loss": 0.037,
"num_input_tokens_seen": 430077696,
"step": 420000
},
{
"epoch": 3.8798313357507315,
"grad_norm": 1.6916519403457642,
"learning_rate": 3.060088945479374e-05,
"loss": 0.038,
"num_input_tokens_seen": 430589696,
"step": 420500
},
{
"epoch": 3.8844446904900307,
"grad_norm": 2.846257209777832,
"learning_rate": 3.057782268109724e-05,
"loss": 0.0415,
"num_input_tokens_seen": 431101696,
"step": 421000
},
{
"epoch": 3.88905804522933,
"grad_norm": 0.8271204233169556,
"learning_rate": 3.0554755907400746e-05,
"loss": 0.0428,
"num_input_tokens_seen": 431613696,
"step": 421500
},
{
"epoch": 3.893671399968629,
"grad_norm": 1.4244275093078613,
"learning_rate": 3.0531689133704247e-05,
"loss": 0.042,
"num_input_tokens_seen": 432125696,
"step": 422000
},
{
"epoch": 3.8982847547079285,
"grad_norm": 1.629799485206604,
"learning_rate": 3.050862236000775e-05,
"loss": 0.038,
"num_input_tokens_seen": 432637696,
"step": 422500
},
{
"epoch": 3.9028981094472277,
"grad_norm": 1.1674317121505737,
"learning_rate": 3.0485555586311255e-05,
"loss": 0.0408,
"num_input_tokens_seen": 433149696,
"step": 423000
},
{
"epoch": 3.907511464186527,
"grad_norm": 0.816435694694519,
"learning_rate": 3.046248881261476e-05,
"loss": 0.0395,
"num_input_tokens_seen": 433661696,
"step": 423500
},
{
"epoch": 3.9121248189258266,
"grad_norm": 0.8461304903030396,
"learning_rate": 3.0439422038918262e-05,
"loss": 0.0414,
"num_input_tokens_seen": 434173696,
"step": 424000
},
{
"epoch": 3.916738173665126,
"grad_norm": 1.0469881296157837,
"learning_rate": 3.0416355265221763e-05,
"loss": 0.0403,
"num_input_tokens_seen": 434685696,
"step": 424500
},
{
"epoch": 3.921351528404425,
"grad_norm": 2.0151569843292236,
"learning_rate": 3.0393288491525267e-05,
"loss": 0.0411,
"num_input_tokens_seen": 435197696,
"step": 425000
},
{
"epoch": 3.9259648831437244,
"grad_norm": 1.178753137588501,
"learning_rate": 3.0370221717828774e-05,
"loss": 0.0415,
"num_input_tokens_seen": 435709696,
"step": 425500
},
{
"epoch": 3.9305782378830236,
"grad_norm": 0.6420595049858093,
"learning_rate": 3.034715494413228e-05,
"loss": 0.0433,
"num_input_tokens_seen": 436221696,
"step": 426000
},
{
"epoch": 3.9351915926223233,
"grad_norm": 1.1695127487182617,
"learning_rate": 3.0324088170435776e-05,
"loss": 0.0415,
"num_input_tokens_seen": 436733696,
"step": 426500
},
{
"epoch": 3.9398049473616226,
"grad_norm": 0.9923868179321289,
"learning_rate": 3.0301021396739283e-05,
"loss": 0.0412,
"num_input_tokens_seen": 437245696,
"step": 427000
},
{
"epoch": 3.944418302100922,
"grad_norm": 0.8079075217247009,
"learning_rate": 3.0277954623042787e-05,
"loss": 0.0401,
"num_input_tokens_seen": 437757696,
"step": 427500
},
{
"epoch": 3.949031656840221,
"grad_norm": 2.699918746948242,
"learning_rate": 3.025488784934629e-05,
"loss": 0.04,
"num_input_tokens_seen": 438269696,
"step": 428000
},
{
"epoch": 3.9536450115795203,
"grad_norm": 0.577458381652832,
"learning_rate": 3.023182107564979e-05,
"loss": 0.0404,
"num_input_tokens_seen": 438781696,
"step": 428500
},
{
"epoch": 3.9582583663188196,
"grad_norm": 0.6960185170173645,
"learning_rate": 3.0208754301953295e-05,
"loss": 0.0393,
"num_input_tokens_seen": 439293696,
"step": 429000
},
{
"epoch": 3.962871721058119,
"grad_norm": 1.2610116004943848,
"learning_rate": 3.01856875282568e-05,
"loss": 0.0385,
"num_input_tokens_seen": 439805696,
"step": 429500
},
{
"epoch": 3.9674850757974185,
"grad_norm": 1.0515618324279785,
"learning_rate": 3.01626207545603e-05,
"loss": 0.0386,
"num_input_tokens_seen": 440317696,
"step": 430000
},
{
"epoch": 3.9720984305367177,
"grad_norm": 0.9695286154747009,
"learning_rate": 3.0139553980863804e-05,
"loss": 0.0425,
"num_input_tokens_seen": 440829696,
"step": 430500
},
{
"epoch": 3.976711785276017,
"grad_norm": 1.542039155960083,
"learning_rate": 3.011648720716731e-05,
"loss": 0.0392,
"num_input_tokens_seen": 441341696,
"step": 431000
},
{
"epoch": 3.9813251400153162,
"grad_norm": 1.2009466886520386,
"learning_rate": 3.0093420433470815e-05,
"loss": 0.043,
"num_input_tokens_seen": 441853696,
"step": 431500
},
{
"epoch": 3.985938494754616,
"grad_norm": 1.8694528341293335,
"learning_rate": 3.0070353659774312e-05,
"loss": 0.0396,
"num_input_tokens_seen": 442365696,
"step": 432000
},
{
"epoch": 3.990551849493915,
"grad_norm": 1.2931849956512451,
"learning_rate": 3.004728688607782e-05,
"loss": 0.0382,
"num_input_tokens_seen": 442877696,
"step": 432500
},
{
"epoch": 3.9951652042332144,
"grad_norm": 0.953074038028717,
"learning_rate": 3.0024220112381324e-05,
"loss": 0.0429,
"num_input_tokens_seen": 443389696,
"step": 433000
},
{
"epoch": 3.9997785589725137,
"grad_norm": 2.807677745819092,
"learning_rate": 3.0001153338684828e-05,
"loss": 0.0387,
"num_input_tokens_seen": 443901696,
"step": 433500
},
{
"epoch": 4.0,
"eval_combined_score": 0.06748922723897993,
"eval_loss": 0.06748922914266586,
"eval_mse": 0.06748922533529399,
"eval_runtime": 49.5025,
"eval_samples_per_second": 1946.123,
"eval_steps_per_second": 243.281,
"num_input_tokens_seen": 443925504,
"step": 433524
},
{
"epoch": 4.004391913711813,
"grad_norm": 0.2404492050409317,
"learning_rate": 2.997808656498833e-05,
"loss": 0.0308,
"num_input_tokens_seen": 444412928,
"step": 434000
},
{
"epoch": 4.009005268451112,
"grad_norm": 1.2364345788955688,
"learning_rate": 2.9955019791291832e-05,
"loss": 0.0297,
"num_input_tokens_seen": 444924928,
"step": 434500
},
{
"epoch": 4.013618623190411,
"grad_norm": 0.9113791584968567,
"learning_rate": 2.9931953017595336e-05,
"loss": 0.0287,
"num_input_tokens_seen": 445436928,
"step": 435000
},
{
"epoch": 4.018231977929711,
"grad_norm": 1.880218267440796,
"learning_rate": 2.9908886243898837e-05,
"loss": 0.0294,
"num_input_tokens_seen": 445948928,
"step": 435500
},
{
"epoch": 4.02284533266901,
"grad_norm": 1.7842798233032227,
"learning_rate": 2.988581947020234e-05,
"loss": 0.0288,
"num_input_tokens_seen": 446460928,
"step": 436000
},
{
"epoch": 4.027458687408309,
"grad_norm": 0.5358702540397644,
"learning_rate": 2.9862752696505848e-05,
"loss": 0.0305,
"num_input_tokens_seen": 446972928,
"step": 436500
},
{
"epoch": 4.032072042147609,
"grad_norm": 0.7529350519180298,
"learning_rate": 2.9839685922809352e-05,
"loss": 0.029,
"num_input_tokens_seen": 447484928,
"step": 437000
},
{
"epoch": 4.0366853968869085,
"grad_norm": 0.6187124848365784,
"learning_rate": 2.981661914911285e-05,
"loss": 0.0303,
"num_input_tokens_seen": 447996928,
"step": 437500
},
{
"epoch": 4.041298751626208,
"grad_norm": 1.1267274618148804,
"learning_rate": 2.9793552375416357e-05,
"loss": 0.0292,
"num_input_tokens_seen": 448508928,
"step": 438000
},
{
"epoch": 4.045912106365507,
"grad_norm": 1.6049976348876953,
"learning_rate": 2.977048560171986e-05,
"loss": 0.0292,
"num_input_tokens_seen": 449020928,
"step": 438500
},
{
"epoch": 4.050525461104806,
"grad_norm": 3.9203622341156006,
"learning_rate": 2.9747418828023365e-05,
"loss": 0.0312,
"num_input_tokens_seen": 449532928,
"step": 439000
},
{
"epoch": 4.0551388158441055,
"grad_norm": 0.6487706899642944,
"learning_rate": 2.9724352054326865e-05,
"loss": 0.029,
"num_input_tokens_seen": 450044928,
"step": 439500
},
{
"epoch": 4.059752170583405,
"grad_norm": 0.9871296882629395,
"learning_rate": 2.970128528063037e-05,
"loss": 0.0299,
"num_input_tokens_seen": 450556928,
"step": 440000
},
{
"epoch": 4.064365525322704,
"grad_norm": 0.4027337431907654,
"learning_rate": 2.9678218506933873e-05,
"loss": 0.0287,
"num_input_tokens_seen": 451068928,
"step": 440500
},
{
"epoch": 4.068978880062003,
"grad_norm": 1.1440553665161133,
"learning_rate": 2.965515173323738e-05,
"loss": 0.0313,
"num_input_tokens_seen": 451580928,
"step": 441000
},
{
"epoch": 4.0735922348013025,
"grad_norm": 0.5619149208068848,
"learning_rate": 2.9632084959540878e-05,
"loss": 0.0334,
"num_input_tokens_seen": 452092928,
"step": 441500
},
{
"epoch": 4.078205589540602,
"grad_norm": 3.5681047439575195,
"learning_rate": 2.9609018185844385e-05,
"loss": 0.0301,
"num_input_tokens_seen": 452604928,
"step": 442000
},
{
"epoch": 4.082818944279902,
"grad_norm": 1.2567273378372192,
"learning_rate": 2.958595141214789e-05,
"loss": 0.0317,
"num_input_tokens_seen": 453116928,
"step": 442500
},
{
"epoch": 4.087432299019201,
"grad_norm": 1.553036093711853,
"learning_rate": 2.956288463845139e-05,
"loss": 0.0296,
"num_input_tokens_seen": 453628928,
"step": 443000
},
{
"epoch": 4.0920456537585,
"grad_norm": 0.8509573340415955,
"learning_rate": 2.9539817864754894e-05,
"loss": 0.0325,
"num_input_tokens_seen": 454140928,
"step": 443500
},
{
"epoch": 4.0966590084978,
"grad_norm": 1.0355197191238403,
"learning_rate": 2.9516751091058398e-05,
"loss": 0.0346,
"num_input_tokens_seen": 454652928,
"step": 444000
},
{
"epoch": 4.101272363237099,
"grad_norm": 1.49540376663208,
"learning_rate": 2.94936843173619e-05,
"loss": 0.0335,
"num_input_tokens_seen": 455164928,
"step": 444500
},
{
"epoch": 4.105885717976398,
"grad_norm": 1.6079996824264526,
"learning_rate": 2.9470617543665402e-05,
"loss": 0.0311,
"num_input_tokens_seen": 455676928,
"step": 445000
},
{
"epoch": 4.110499072715697,
"grad_norm": 0.5073397159576416,
"learning_rate": 2.9447550769968906e-05,
"loss": 0.0308,
"num_input_tokens_seen": 456188928,
"step": 445500
},
{
"epoch": 4.115112427454997,
"grad_norm": 1.6608948707580566,
"learning_rate": 2.942448399627241e-05,
"loss": 0.0302,
"num_input_tokens_seen": 456700928,
"step": 446000
},
{
"epoch": 4.119725782194296,
"grad_norm": 0.9647392630577087,
"learning_rate": 2.9401417222575917e-05,
"loss": 0.0311,
"num_input_tokens_seen": 457212928,
"step": 446500
},
{
"epoch": 4.124339136933595,
"grad_norm": 0.6390677690505981,
"learning_rate": 2.9378350448879415e-05,
"loss": 0.0305,
"num_input_tokens_seen": 457724928,
"step": 447000
},
{
"epoch": 4.128952491672894,
"grad_norm": 1.7215697765350342,
"learning_rate": 2.9355283675182922e-05,
"loss": 0.0328,
"num_input_tokens_seen": 458236928,
"step": 447500
},
{
"epoch": 4.133565846412194,
"grad_norm": 1.1551854610443115,
"learning_rate": 2.9332216901486426e-05,
"loss": 0.0313,
"num_input_tokens_seen": 458748928,
"step": 448000
},
{
"epoch": 4.138179201151494,
"grad_norm": 1.6345293521881104,
"learning_rate": 2.9309150127789927e-05,
"loss": 0.0311,
"num_input_tokens_seen": 459260928,
"step": 448500
},
{
"epoch": 4.142792555890793,
"grad_norm": 1.5224887132644653,
"learning_rate": 2.928608335409343e-05,
"loss": 0.0307,
"num_input_tokens_seen": 459772928,
"step": 449000
},
{
"epoch": 4.147405910630092,
"grad_norm": 1.6716899871826172,
"learning_rate": 2.9263016580396934e-05,
"loss": 0.0346,
"num_input_tokens_seen": 460284928,
"step": 449500
},
{
"epoch": 4.1520192653693915,
"grad_norm": 2.299623489379883,
"learning_rate": 2.923994980670044e-05,
"loss": 0.0301,
"num_input_tokens_seen": 460796928,
"step": 450000
},
{
"epoch": 4.156632620108691,
"grad_norm": 0.7651464343070984,
"learning_rate": 2.921688303300394e-05,
"loss": 0.0308,
"num_input_tokens_seen": 461308928,
"step": 450500
},
{
"epoch": 4.16124597484799,
"grad_norm": 1.1913387775421143,
"learning_rate": 2.9193816259307443e-05,
"loss": 0.0312,
"num_input_tokens_seen": 461820928,
"step": 451000
},
{
"epoch": 4.165859329587289,
"grad_norm": 1.0334786176681519,
"learning_rate": 2.9170749485610947e-05,
"loss": 0.0335,
"num_input_tokens_seen": 462332928,
"step": 451500
},
{
"epoch": 4.1704726843265885,
"grad_norm": 1.9780852794647217,
"learning_rate": 2.9147682711914454e-05,
"loss": 0.0344,
"num_input_tokens_seen": 462844928,
"step": 452000
},
{
"epoch": 4.175086039065888,
"grad_norm": 0.8200696706771851,
"learning_rate": 2.912461593821795e-05,
"loss": 0.033,
"num_input_tokens_seen": 463356928,
"step": 452500
},
{
"epoch": 4.179699393805187,
"grad_norm": 1.0019230842590332,
"learning_rate": 2.910154916452146e-05,
"loss": 0.0303,
"num_input_tokens_seen": 463868928,
"step": 453000
},
{
"epoch": 4.184312748544486,
"grad_norm": 2.18719744682312,
"learning_rate": 2.9078482390824963e-05,
"loss": 0.03,
"num_input_tokens_seen": 464380928,
"step": 453500
},
{
"epoch": 4.1889261032837855,
"grad_norm": 1.2453852891921997,
"learning_rate": 2.9055415617128467e-05,
"loss": 0.0306,
"num_input_tokens_seen": 464892928,
"step": 454000
},
{
"epoch": 4.193539458023086,
"grad_norm": 2.0544652938842773,
"learning_rate": 2.9032348843431967e-05,
"loss": 0.0331,
"num_input_tokens_seen": 465404928,
"step": 454500
},
{
"epoch": 4.198152812762385,
"grad_norm": 5.509039878845215,
"learning_rate": 2.900928206973547e-05,
"loss": 0.0308,
"num_input_tokens_seen": 465916928,
"step": 455000
},
{
"epoch": 4.202766167501684,
"grad_norm": 0.6365485787391663,
"learning_rate": 2.8986215296038975e-05,
"loss": 0.0322,
"num_input_tokens_seen": 466428928,
"step": 455500
},
{
"epoch": 4.207379522240983,
"grad_norm": 0.8369764685630798,
"learning_rate": 2.8963148522342476e-05,
"loss": 0.0311,
"num_input_tokens_seen": 466940928,
"step": 456000
},
{
"epoch": 4.211992876980283,
"grad_norm": 1.3454687595367432,
"learning_rate": 2.894008174864598e-05,
"loss": 0.0317,
"num_input_tokens_seen": 467452928,
"step": 456500
},
{
"epoch": 4.216606231719582,
"grad_norm": 1.042900800704956,
"learning_rate": 2.8917014974949487e-05,
"loss": 0.0304,
"num_input_tokens_seen": 467964928,
"step": 457000
},
{
"epoch": 4.221219586458881,
"grad_norm": 2.2044434547424316,
"learning_rate": 2.889394820125299e-05,
"loss": 0.0309,
"num_input_tokens_seen": 468476928,
"step": 457500
},
{
"epoch": 4.22583294119818,
"grad_norm": 1.4156602621078491,
"learning_rate": 2.887088142755649e-05,
"loss": 0.0325,
"num_input_tokens_seen": 468988928,
"step": 458000
},
{
"epoch": 4.23044629593748,
"grad_norm": 1.4290229082107544,
"learning_rate": 2.8847814653859996e-05,
"loss": 0.034,
"num_input_tokens_seen": 469500928,
"step": 458500
},
{
"epoch": 4.235059650676779,
"grad_norm": 0.8856704235076904,
"learning_rate": 2.88247478801635e-05,
"loss": 0.0301,
"num_input_tokens_seen": 470012928,
"step": 459000
},
{
"epoch": 4.239673005416078,
"grad_norm": 1.0637128353118896,
"learning_rate": 2.8801681106467004e-05,
"loss": 0.0315,
"num_input_tokens_seen": 470524928,
"step": 459500
},
{
"epoch": 4.244286360155378,
"grad_norm": 0.9506544470787048,
"learning_rate": 2.8778614332770504e-05,
"loss": 0.0298,
"num_input_tokens_seen": 471036928,
"step": 460000
},
{
"epoch": 4.248899714894677,
"grad_norm": 1.05034339427948,
"learning_rate": 2.8755547559074008e-05,
"loss": 0.0331,
"num_input_tokens_seen": 471548928,
"step": 460500
},
{
"epoch": 4.253513069633977,
"grad_norm": 1.1537014245986938,
"learning_rate": 2.8732480785377512e-05,
"loss": 0.0309,
"num_input_tokens_seen": 472060928,
"step": 461000
},
{
"epoch": 4.258126424373276,
"grad_norm": 0.42139768600463867,
"learning_rate": 2.8709414011681013e-05,
"loss": 0.032,
"num_input_tokens_seen": 472572928,
"step": 461500
},
{
"epoch": 4.262739779112575,
"grad_norm": 2.2188069820404053,
"learning_rate": 2.8686347237984517e-05,
"loss": 0.0301,
"num_input_tokens_seen": 473084928,
"step": 462000
},
{
"epoch": 4.267353133851874,
"grad_norm": 1.293926477432251,
"learning_rate": 2.8663280464288024e-05,
"loss": 0.0323,
"num_input_tokens_seen": 473596928,
"step": 462500
},
{
"epoch": 4.271966488591174,
"grad_norm": 1.7295567989349365,
"learning_rate": 2.8640213690591528e-05,
"loss": 0.0286,
"num_input_tokens_seen": 474108928,
"step": 463000
},
{
"epoch": 4.276579843330473,
"grad_norm": 1.3442994356155396,
"learning_rate": 2.8617146916895025e-05,
"loss": 0.0312,
"num_input_tokens_seen": 474620928,
"step": 463500
},
{
"epoch": 4.281193198069772,
"grad_norm": 1.4000321626663208,
"learning_rate": 2.8594080143198533e-05,
"loss": 0.0326,
"num_input_tokens_seen": 475132928,
"step": 464000
},
{
"epoch": 4.285806552809071,
"grad_norm": 1.4646140336990356,
"learning_rate": 2.8571013369502037e-05,
"loss": 0.0316,
"num_input_tokens_seen": 475644928,
"step": 464500
},
{
"epoch": 4.290419907548371,
"grad_norm": 1.296420931816101,
"learning_rate": 2.854794659580554e-05,
"loss": 0.0328,
"num_input_tokens_seen": 476156928,
"step": 465000
},
{
"epoch": 4.295033262287671,
"grad_norm": 0.947172999382019,
"learning_rate": 2.852487982210904e-05,
"loss": 0.031,
"num_input_tokens_seen": 476668928,
"step": 465500
},
{
"epoch": 4.29964661702697,
"grad_norm": 0.6631402969360352,
"learning_rate": 2.8501813048412545e-05,
"loss": 0.0291,
"num_input_tokens_seen": 477180928,
"step": 466000
},
{
"epoch": 4.304259971766269,
"grad_norm": 0.5878441333770752,
"learning_rate": 2.847874627471605e-05,
"loss": 0.0316,
"num_input_tokens_seen": 477692928,
"step": 466500
},
{
"epoch": 4.3088733265055685,
"grad_norm": 1.32041335105896,
"learning_rate": 2.8455679501019557e-05,
"loss": 0.0296,
"num_input_tokens_seen": 478204928,
"step": 467000
},
{
"epoch": 4.313486681244868,
"grad_norm": 0.7355374097824097,
"learning_rate": 2.8432612727323054e-05,
"loss": 0.0322,
"num_input_tokens_seen": 478716928,
"step": 467500
},
{
"epoch": 4.318100035984167,
"grad_norm": 0.5715786218643188,
"learning_rate": 2.840954595362656e-05,
"loss": 0.0345,
"num_input_tokens_seen": 479228928,
"step": 468000
},
{
"epoch": 4.322713390723466,
"grad_norm": 0.873299777507782,
"learning_rate": 2.8386479179930065e-05,
"loss": 0.0341,
"num_input_tokens_seen": 479740928,
"step": 468500
},
{
"epoch": 4.3273267454627655,
"grad_norm": 0.4993022382259369,
"learning_rate": 2.8363412406233562e-05,
"loss": 0.0347,
"num_input_tokens_seen": 480252928,
"step": 469000
},
{
"epoch": 4.331940100202065,
"grad_norm": 1.0970638990402222,
"learning_rate": 2.834034563253707e-05,
"loss": 0.0297,
"num_input_tokens_seen": 480764928,
"step": 469500
},
{
"epoch": 4.336553454941364,
"grad_norm": 1.030454158782959,
"learning_rate": 2.8317278858840574e-05,
"loss": 0.0309,
"num_input_tokens_seen": 481276928,
"step": 470000
},
{
"epoch": 4.341166809680663,
"grad_norm": 2.224727153778076,
"learning_rate": 2.8294212085144078e-05,
"loss": 0.0319,
"num_input_tokens_seen": 481788928,
"step": 470500
},
{
"epoch": 4.3457801644199625,
"grad_norm": 0.8922818899154663,
"learning_rate": 2.8271145311447578e-05,
"loss": 0.0324,
"num_input_tokens_seen": 482300928,
"step": 471000
},
{
"epoch": 4.350393519159263,
"grad_norm": 1.355394721031189,
"learning_rate": 2.8248078537751082e-05,
"loss": 0.0322,
"num_input_tokens_seen": 482812928,
"step": 471500
},
{
"epoch": 4.355006873898562,
"grad_norm": 1.3697582483291626,
"learning_rate": 2.8225011764054586e-05,
"loss": 0.0307,
"num_input_tokens_seen": 483324928,
"step": 472000
},
{
"epoch": 4.359620228637861,
"grad_norm": 0.8543123006820679,
"learning_rate": 2.8201944990358093e-05,
"loss": 0.0308,
"num_input_tokens_seen": 483836928,
"step": 472500
},
{
"epoch": 4.36423358337716,
"grad_norm": 1.2586286067962646,
"learning_rate": 2.817887821666159e-05,
"loss": 0.0334,
"num_input_tokens_seen": 484348928,
"step": 473000
},
{
"epoch": 4.36884693811646,
"grad_norm": 1.0295668840408325,
"learning_rate": 2.8155811442965098e-05,
"loss": 0.0324,
"num_input_tokens_seen": 484860928,
"step": 473500
},
{
"epoch": 4.373460292855759,
"grad_norm": 1.3368573188781738,
"learning_rate": 2.8132744669268602e-05,
"loss": 0.0303,
"num_input_tokens_seen": 485372928,
"step": 474000
},
{
"epoch": 4.378073647595058,
"grad_norm": 0.5129613280296326,
"learning_rate": 2.81096778955721e-05,
"loss": 0.031,
"num_input_tokens_seen": 485884928,
"step": 474500
},
{
"epoch": 4.382687002334357,
"grad_norm": 0.7094746828079224,
"learning_rate": 2.8086611121875606e-05,
"loss": 0.0305,
"num_input_tokens_seen": 486396928,
"step": 475000
},
{
"epoch": 4.387300357073657,
"grad_norm": 1.2379733324050903,
"learning_rate": 2.806354434817911e-05,
"loss": 0.035,
"num_input_tokens_seen": 486908928,
"step": 475500
},
{
"epoch": 4.391913711812956,
"grad_norm": 0.9573284387588501,
"learning_rate": 2.8040477574482614e-05,
"loss": 0.0321,
"num_input_tokens_seen": 487420928,
"step": 476000
},
{
"epoch": 4.396527066552255,
"grad_norm": 0.8460474014282227,
"learning_rate": 2.8017410800786115e-05,
"loss": 0.032,
"num_input_tokens_seen": 487932928,
"step": 476500
},
{
"epoch": 4.401140421291554,
"grad_norm": 0.5795192122459412,
"learning_rate": 2.799434402708962e-05,
"loss": 0.0331,
"num_input_tokens_seen": 488444928,
"step": 477000
},
{
"epoch": 4.4057537760308545,
"grad_norm": 2.4742841720581055,
"learning_rate": 2.7971277253393123e-05,
"loss": 0.0287,
"num_input_tokens_seen": 488956928,
"step": 477500
},
{
"epoch": 4.410367130770154,
"grad_norm": 2.2295806407928467,
"learning_rate": 2.794821047969663e-05,
"loss": 0.0343,
"num_input_tokens_seen": 489468928,
"step": 478000
},
{
"epoch": 4.414980485509453,
"grad_norm": 1.4073495864868164,
"learning_rate": 2.7925143706000128e-05,
"loss": 0.0335,
"num_input_tokens_seen": 489980928,
"step": 478500
},
{
"epoch": 4.419593840248752,
"grad_norm": 1.378461480140686,
"learning_rate": 2.7902076932303635e-05,
"loss": 0.0343,
"num_input_tokens_seen": 490492928,
"step": 479000
},
{
"epoch": 4.4242071949880515,
"grad_norm": 0.6204975247383118,
"learning_rate": 2.787901015860714e-05,
"loss": 0.0323,
"num_input_tokens_seen": 491004928,
"step": 479500
},
{
"epoch": 4.428820549727351,
"grad_norm": 1.0409677028656006,
"learning_rate": 2.7855943384910643e-05,
"loss": 0.0325,
"num_input_tokens_seen": 491516928,
"step": 480000
},
{
"epoch": 4.43343390446665,
"grad_norm": 1.2104921340942383,
"learning_rate": 2.7832876611214143e-05,
"loss": 0.0339,
"num_input_tokens_seen": 492028928,
"step": 480500
},
{
"epoch": 4.438047259205949,
"grad_norm": 2.0074825286865234,
"learning_rate": 2.7809809837517647e-05,
"loss": 0.0322,
"num_input_tokens_seen": 492540928,
"step": 481000
},
{
"epoch": 4.4426606139452485,
"grad_norm": 0.8541880249977112,
"learning_rate": 2.778674306382115e-05,
"loss": 0.0299,
"num_input_tokens_seen": 493052928,
"step": 481500
},
{
"epoch": 4.447273968684548,
"grad_norm": 2.382373332977295,
"learning_rate": 2.7763676290124652e-05,
"loss": 0.0303,
"num_input_tokens_seen": 493564928,
"step": 482000
},
{
"epoch": 4.451887323423847,
"grad_norm": 0.8820599317550659,
"learning_rate": 2.7740609516428156e-05,
"loss": 0.0306,
"num_input_tokens_seen": 494076928,
"step": 482500
},
{
"epoch": 4.456500678163147,
"grad_norm": 0.6329056620597839,
"learning_rate": 2.771754274273166e-05,
"loss": 0.0313,
"num_input_tokens_seen": 494588928,
"step": 483000
},
{
"epoch": 4.461114032902446,
"grad_norm": 0.7391223311424255,
"learning_rate": 2.7694475969035167e-05,
"loss": 0.0342,
"num_input_tokens_seen": 495100928,
"step": 483500
},
{
"epoch": 4.465727387641746,
"grad_norm": 0.6143118143081665,
"learning_rate": 2.7671409195338664e-05,
"loss": 0.0324,
"num_input_tokens_seen": 495612928,
"step": 484000
},
{
"epoch": 4.470340742381045,
"grad_norm": 2.01242733001709,
"learning_rate": 2.7648342421642172e-05,
"loss": 0.029,
"num_input_tokens_seen": 496124928,
"step": 484500
},
{
"epoch": 4.474954097120344,
"grad_norm": 0.9278964996337891,
"learning_rate": 2.7625275647945676e-05,
"loss": 0.0362,
"num_input_tokens_seen": 496636928,
"step": 485000
},
{
"epoch": 4.479567451859643,
"grad_norm": 1.0499247312545776,
"learning_rate": 2.760220887424918e-05,
"loss": 0.033,
"num_input_tokens_seen": 497148928,
"step": 485500
},
{
"epoch": 4.484180806598943,
"grad_norm": 1.7017521858215332,
"learning_rate": 2.757914210055268e-05,
"loss": 0.0304,
"num_input_tokens_seen": 497660928,
"step": 486000
},
{
"epoch": 4.488794161338242,
"grad_norm": 2.3478429317474365,
"learning_rate": 2.7556075326856184e-05,
"loss": 0.0329,
"num_input_tokens_seen": 498172928,
"step": 486500
},
{
"epoch": 4.493407516077541,
"grad_norm": 3.133190155029297,
"learning_rate": 2.7533008553159688e-05,
"loss": 0.0341,
"num_input_tokens_seen": 498684928,
"step": 487000
},
{
"epoch": 4.49802087081684,
"grad_norm": 0.5625250339508057,
"learning_rate": 2.750994177946319e-05,
"loss": 0.0328,
"num_input_tokens_seen": 499196928,
"step": 487500
},
{
"epoch": 4.50263422555614,
"grad_norm": 1.0259020328521729,
"learning_rate": 2.7486875005766693e-05,
"loss": 0.0318,
"num_input_tokens_seen": 499708928,
"step": 488000
},
{
"epoch": 4.50724758029544,
"grad_norm": 0.48490577936172485,
"learning_rate": 2.7463808232070197e-05,
"loss": 0.0335,
"num_input_tokens_seen": 500220928,
"step": 488500
},
{
"epoch": 4.511860935034738,
"grad_norm": 0.40793031454086304,
"learning_rate": 2.7440741458373704e-05,
"loss": 0.0312,
"num_input_tokens_seen": 500732928,
"step": 489000
},
{
"epoch": 4.516474289774038,
"grad_norm": 1.1319341659545898,
"learning_rate": 2.74176746846772e-05,
"loss": 0.0327,
"num_input_tokens_seen": 501244928,
"step": 489500
},
{
"epoch": 4.5210876445133374,
"grad_norm": 1.9659985303878784,
"learning_rate": 2.739460791098071e-05,
"loss": 0.0338,
"num_input_tokens_seen": 501756928,
"step": 490000
},
{
"epoch": 4.525700999252637,
"grad_norm": 0.5315821766853333,
"learning_rate": 2.7371541137284213e-05,
"loss": 0.0336,
"num_input_tokens_seen": 502268928,
"step": 490500
},
{
"epoch": 4.530314353991936,
"grad_norm": 0.47908708453178406,
"learning_rate": 2.7348474363587717e-05,
"loss": 0.0295,
"num_input_tokens_seen": 502780928,
"step": 491000
},
{
"epoch": 4.534927708731235,
"grad_norm": 0.9557788968086243,
"learning_rate": 2.7325407589891217e-05,
"loss": 0.0314,
"num_input_tokens_seen": 503292928,
"step": 491500
},
{
"epoch": 4.539541063470534,
"grad_norm": 1.229929804801941,
"learning_rate": 2.730234081619472e-05,
"loss": 0.03,
"num_input_tokens_seen": 503804928,
"step": 492000
},
{
"epoch": 4.544154418209834,
"grad_norm": 2.0131001472473145,
"learning_rate": 2.7279274042498225e-05,
"loss": 0.0334,
"num_input_tokens_seen": 504316928,
"step": 492500
},
{
"epoch": 4.548767772949133,
"grad_norm": 1.8093568086624146,
"learning_rate": 2.7256207268801732e-05,
"loss": 0.0315,
"num_input_tokens_seen": 504828928,
"step": 493000
},
{
"epoch": 4.553381127688432,
"grad_norm": 3.043375253677368,
"learning_rate": 2.723314049510523e-05,
"loss": 0.0336,
"num_input_tokens_seen": 505340928,
"step": 493500
},
{
"epoch": 4.557994482427731,
"grad_norm": 1.5375556945800781,
"learning_rate": 2.7210073721408734e-05,
"loss": 0.0334,
"num_input_tokens_seen": 505852928,
"step": 494000
},
{
"epoch": 4.562607837167031,
"grad_norm": 1.2980600595474243,
"learning_rate": 2.718700694771224e-05,
"loss": 0.0315,
"num_input_tokens_seen": 506364928,
"step": 494500
},
{
"epoch": 4.567221191906331,
"grad_norm": 1.3334441184997559,
"learning_rate": 2.7163940174015738e-05,
"loss": 0.0345,
"num_input_tokens_seen": 506876928,
"step": 495000
},
{
"epoch": 4.57183454664563,
"grad_norm": 10.070221900939941,
"learning_rate": 2.7140873400319246e-05,
"loss": 0.034,
"num_input_tokens_seen": 507388928,
"step": 495500
},
{
"epoch": 4.576447901384929,
"grad_norm": 9.152368545532227,
"learning_rate": 2.711780662662275e-05,
"loss": 0.0316,
"num_input_tokens_seen": 507900928,
"step": 496000
},
{
"epoch": 4.5810612561242285,
"grad_norm": 2.569089651107788,
"learning_rate": 2.7094739852926254e-05,
"loss": 0.0336,
"num_input_tokens_seen": 508412928,
"step": 496500
},
{
"epoch": 4.585674610863528,
"grad_norm": 0.7014693021774292,
"learning_rate": 2.7071673079229754e-05,
"loss": 0.0316,
"num_input_tokens_seen": 508924928,
"step": 497000
},
{
"epoch": 4.590287965602827,
"grad_norm": 1.182787537574768,
"learning_rate": 2.7048606305533258e-05,
"loss": 0.0315,
"num_input_tokens_seen": 509436928,
"step": 497500
},
{
"epoch": 4.594901320342126,
"grad_norm": 0.6506703495979309,
"learning_rate": 2.7025539531836762e-05,
"loss": 0.0328,
"num_input_tokens_seen": 509948928,
"step": 498000
},
{
"epoch": 4.5995146750814255,
"grad_norm": 0.5681861639022827,
"learning_rate": 2.700247275814027e-05,
"loss": 0.0343,
"num_input_tokens_seen": 510460928,
"step": 498500
},
{
"epoch": 4.604128029820725,
"grad_norm": 1.2895385026931763,
"learning_rate": 2.6979405984443767e-05,
"loss": 0.0332,
"num_input_tokens_seen": 510972928,
"step": 499000
},
{
"epoch": 4.608741384560024,
"grad_norm": 1.2549630403518677,
"learning_rate": 2.6956339210747274e-05,
"loss": 0.032,
"num_input_tokens_seen": 511484928,
"step": 499500
},
{
"epoch": 4.613354739299323,
"grad_norm": 1.486061692237854,
"learning_rate": 2.6933272437050778e-05,
"loss": 0.0331,
"num_input_tokens_seen": 511996928,
"step": 500000
},
{
"epoch": 4.617968094038623,
"grad_norm": 1.0897846221923828,
"learning_rate": 2.6910205663354275e-05,
"loss": 0.0327,
"num_input_tokens_seen": 512508928,
"step": 500500
},
{
"epoch": 4.622581448777923,
"grad_norm": 0.9600527286529541,
"learning_rate": 2.6887138889657782e-05,
"loss": 0.0326,
"num_input_tokens_seen": 513020928,
"step": 501000
},
{
"epoch": 4.627194803517222,
"grad_norm": 3.943963050842285,
"learning_rate": 2.6864072115961286e-05,
"loss": 0.0283,
"num_input_tokens_seen": 513532928,
"step": 501500
},
{
"epoch": 4.631808158256521,
"grad_norm": 1.1537055969238281,
"learning_rate": 2.684100534226479e-05,
"loss": 0.0353,
"num_input_tokens_seen": 514044928,
"step": 502000
},
{
"epoch": 4.63642151299582,
"grad_norm": 2.200751543045044,
"learning_rate": 2.681793856856829e-05,
"loss": 0.0323,
"num_input_tokens_seen": 514556928,
"step": 502500
},
{
"epoch": 4.64103486773512,
"grad_norm": 1.1844205856323242,
"learning_rate": 2.6794871794871795e-05,
"loss": 0.0308,
"num_input_tokens_seen": 515068928,
"step": 503000
},
{
"epoch": 4.645648222474419,
"grad_norm": 4.328240871429443,
"learning_rate": 2.67718050211753e-05,
"loss": 0.0337,
"num_input_tokens_seen": 515580928,
"step": 503500
},
{
"epoch": 4.650261577213718,
"grad_norm": 1.1905447244644165,
"learning_rate": 2.6748738247478806e-05,
"loss": 0.0335,
"num_input_tokens_seen": 516092928,
"step": 504000
},
{
"epoch": 4.654874931953017,
"grad_norm": 0.4069402813911438,
"learning_rate": 2.6725671473782303e-05,
"loss": 0.034,
"num_input_tokens_seen": 516604928,
"step": 504500
},
{
"epoch": 4.659488286692317,
"grad_norm": 0.7860555648803711,
"learning_rate": 2.670260470008581e-05,
"loss": 0.034,
"num_input_tokens_seen": 517116928,
"step": 505000
},
{
"epoch": 4.664101641431616,
"grad_norm": 0.5769841074943542,
"learning_rate": 2.6679537926389315e-05,
"loss": 0.033,
"num_input_tokens_seen": 517628928,
"step": 505500
},
{
"epoch": 4.668714996170916,
"grad_norm": 1.5153945684432983,
"learning_rate": 2.665647115269282e-05,
"loss": 0.031,
"num_input_tokens_seen": 518140928,
"step": 506000
},
{
"epoch": 4.673328350910215,
"grad_norm": 1.6713037490844727,
"learning_rate": 2.663340437899632e-05,
"loss": 0.037,
"num_input_tokens_seen": 518652928,
"step": 506500
},
{
"epoch": 4.6779417056495145,
"grad_norm": 1.2307850122451782,
"learning_rate": 2.6610337605299823e-05,
"loss": 0.0318,
"num_input_tokens_seen": 519164928,
"step": 507000
},
{
"epoch": 4.682555060388814,
"grad_norm": 1.2771391868591309,
"learning_rate": 2.6587270831603327e-05,
"loss": 0.0292,
"num_input_tokens_seen": 519676928,
"step": 507500
},
{
"epoch": 4.687168415128113,
"grad_norm": 1.468724012374878,
"learning_rate": 2.6564204057906828e-05,
"loss": 0.0314,
"num_input_tokens_seen": 520188928,
"step": 508000
},
{
"epoch": 4.691781769867412,
"grad_norm": 0.9526101350784302,
"learning_rate": 2.6541137284210332e-05,
"loss": 0.033,
"num_input_tokens_seen": 520700928,
"step": 508500
},
{
"epoch": 4.6963951246067115,
"grad_norm": 0.8857848048210144,
"learning_rate": 2.6518070510513836e-05,
"loss": 0.0333,
"num_input_tokens_seen": 521212928,
"step": 509000
},
{
"epoch": 4.701008479346011,
"grad_norm": 1.5435466766357422,
"learning_rate": 2.6495003736817343e-05,
"loss": 0.0319,
"num_input_tokens_seen": 521724928,
"step": 509500
},
{
"epoch": 4.70562183408531,
"grad_norm": 0.6249234676361084,
"learning_rate": 2.647193696312084e-05,
"loss": 0.0307,
"num_input_tokens_seen": 522236928,
"step": 510000
},
{
"epoch": 4.710235188824609,
"grad_norm": 0.7634549140930176,
"learning_rate": 2.6448870189424348e-05,
"loss": 0.0325,
"num_input_tokens_seen": 522748928,
"step": 510500
},
{
"epoch": 4.7148485435639085,
"grad_norm": 0.8510231375694275,
"learning_rate": 2.6425803415727852e-05,
"loss": 0.0323,
"num_input_tokens_seen": 523260928,
"step": 511000
},
{
"epoch": 4.719461898303209,
"grad_norm": 0.797269344329834,
"learning_rate": 2.6402736642031356e-05,
"loss": 0.035,
"num_input_tokens_seen": 523772928,
"step": 511500
},
{
"epoch": 4.724075253042507,
"grad_norm": 1.6006139516830444,
"learning_rate": 2.6379669868334856e-05,
"loss": 0.0311,
"num_input_tokens_seen": 524284928,
"step": 512000
},
{
"epoch": 4.728688607781807,
"grad_norm": 0.5628824234008789,
"learning_rate": 2.635660309463836e-05,
"loss": 0.0298,
"num_input_tokens_seen": 524796928,
"step": 512500
},
{
"epoch": 4.733301962521106,
"grad_norm": 1.2842258214950562,
"learning_rate": 2.6333536320941864e-05,
"loss": 0.0329,
"num_input_tokens_seen": 525308928,
"step": 513000
},
{
"epoch": 4.737915317260406,
"grad_norm": 1.3331750631332397,
"learning_rate": 2.6310469547245365e-05,
"loss": 0.0346,
"num_input_tokens_seen": 525820928,
"step": 513500
},
{
"epoch": 4.742528671999705,
"grad_norm": 2.3819310665130615,
"learning_rate": 2.628740277354887e-05,
"loss": 0.0339,
"num_input_tokens_seen": 526332928,
"step": 514000
},
{
"epoch": 4.747142026739004,
"grad_norm": 0.8976543545722961,
"learning_rate": 2.6264335999852373e-05,
"loss": 0.035,
"num_input_tokens_seen": 526844928,
"step": 514500
},
{
"epoch": 4.751755381478303,
"grad_norm": 2.7922868728637695,
"learning_rate": 2.624126922615588e-05,
"loss": 0.0344,
"num_input_tokens_seen": 527356928,
"step": 515000
},
{
"epoch": 4.756368736217603,
"grad_norm": 1.2664451599121094,
"learning_rate": 2.6218202452459377e-05,
"loss": 0.033,
"num_input_tokens_seen": 527868928,
"step": 515500
},
{
"epoch": 4.760982090956902,
"grad_norm": 1.8173182010650635,
"learning_rate": 2.6195135678762885e-05,
"loss": 0.033,
"num_input_tokens_seen": 528380928,
"step": 516000
},
{
"epoch": 4.765595445696201,
"grad_norm": 1.2038295269012451,
"learning_rate": 2.617206890506639e-05,
"loss": 0.0329,
"num_input_tokens_seen": 528892928,
"step": 516500
},
{
"epoch": 4.7702088004355,
"grad_norm": 1.3875302076339722,
"learning_rate": 2.6149002131369893e-05,
"loss": 0.0337,
"num_input_tokens_seen": 529404928,
"step": 517000
},
{
"epoch": 4.7748221551748,
"grad_norm": 0.6060103178024292,
"learning_rate": 2.6125935357673393e-05,
"loss": 0.0331,
"num_input_tokens_seen": 529916928,
"step": 517500
},
{
"epoch": 4.7794355099141,
"grad_norm": 3.217010259628296,
"learning_rate": 2.6102868583976897e-05,
"loss": 0.0365,
"num_input_tokens_seen": 530428928,
"step": 518000
},
{
"epoch": 4.784048864653399,
"grad_norm": 1.3630263805389404,
"learning_rate": 2.60798018102804e-05,
"loss": 0.0352,
"num_input_tokens_seen": 530940928,
"step": 518500
},
{
"epoch": 4.788662219392698,
"grad_norm": 1.875205397605896,
"learning_rate": 2.605673503658391e-05,
"loss": 0.0312,
"num_input_tokens_seen": 531452928,
"step": 519000
},
{
"epoch": 4.7932755741319975,
"grad_norm": 1.0889365673065186,
"learning_rate": 2.6033668262887406e-05,
"loss": 0.032,
"num_input_tokens_seen": 531964928,
"step": 519500
},
{
"epoch": 4.797888928871297,
"grad_norm": 1.8945229053497314,
"learning_rate": 2.601060148919091e-05,
"loss": 0.0318,
"num_input_tokens_seen": 532476928,
"step": 520000
},
{
"epoch": 4.802502283610596,
"grad_norm": 0.8704883456230164,
"learning_rate": 2.5987534715494417e-05,
"loss": 0.0353,
"num_input_tokens_seen": 532988928,
"step": 520500
},
{
"epoch": 4.807115638349895,
"grad_norm": 0.5920878052711487,
"learning_rate": 2.5964467941797914e-05,
"loss": 0.0352,
"num_input_tokens_seen": 533500928,
"step": 521000
},
{
"epoch": 4.811728993089194,
"grad_norm": 1.7447361946105957,
"learning_rate": 2.594140116810142e-05,
"loss": 0.0333,
"num_input_tokens_seen": 534012928,
"step": 521500
},
{
"epoch": 4.816342347828494,
"grad_norm": 2.5715444087982178,
"learning_rate": 2.5918334394404926e-05,
"loss": 0.0331,
"num_input_tokens_seen": 534524928,
"step": 522000
},
{
"epoch": 4.820955702567793,
"grad_norm": 1.5223846435546875,
"learning_rate": 2.589526762070843e-05,
"loss": 0.0326,
"num_input_tokens_seen": 535036928,
"step": 522500
},
{
"epoch": 4.825569057307092,
"grad_norm": 1.0512726306915283,
"learning_rate": 2.587220084701193e-05,
"loss": 0.0312,
"num_input_tokens_seen": 535548928,
"step": 523000
},
{
"epoch": 4.830182412046392,
"grad_norm": 1.2424243688583374,
"learning_rate": 2.5849134073315434e-05,
"loss": 0.0356,
"num_input_tokens_seen": 536060928,
"step": 523500
},
{
"epoch": 4.834795766785692,
"grad_norm": 1.2689915895462036,
"learning_rate": 2.5826067299618938e-05,
"loss": 0.0317,
"num_input_tokens_seen": 536572928,
"step": 524000
},
{
"epoch": 4.839409121524991,
"grad_norm": 0.5996227860450745,
"learning_rate": 2.5803000525922445e-05,
"loss": 0.0318,
"num_input_tokens_seen": 537084928,
"step": 524500
},
{
"epoch": 4.84402247626429,
"grad_norm": 1.7113879919052124,
"learning_rate": 2.5779933752225943e-05,
"loss": 0.0322,
"num_input_tokens_seen": 537596928,
"step": 525000
},
{
"epoch": 4.848635831003589,
"grad_norm": 5.173702239990234,
"learning_rate": 2.5756866978529447e-05,
"loss": 0.0338,
"num_input_tokens_seen": 538108928,
"step": 525500
},
{
"epoch": 4.8532491857428885,
"grad_norm": 2.208484172821045,
"learning_rate": 2.5733800204832954e-05,
"loss": 0.0335,
"num_input_tokens_seen": 538620928,
"step": 526000
},
{
"epoch": 4.857862540482188,
"grad_norm": 0.7695846557617188,
"learning_rate": 2.571073343113645e-05,
"loss": 0.0323,
"num_input_tokens_seen": 539132928,
"step": 526500
},
{
"epoch": 4.862475895221487,
"grad_norm": 0.6419717073440552,
"learning_rate": 2.568766665743996e-05,
"loss": 0.0313,
"num_input_tokens_seen": 539644928,
"step": 527000
},
{
"epoch": 4.867089249960786,
"grad_norm": 0.4510629177093506,
"learning_rate": 2.5664599883743462e-05,
"loss": 0.0323,
"num_input_tokens_seen": 540156928,
"step": 527500
},
{
"epoch": 4.8717026047000855,
"grad_norm": 0.6697828769683838,
"learning_rate": 2.5641533110046966e-05,
"loss": 0.0306,
"num_input_tokens_seen": 540668928,
"step": 528000
},
{
"epoch": 4.876315959439385,
"grad_norm": 0.30349186062812805,
"learning_rate": 2.5618466336350467e-05,
"loss": 0.0354,
"num_input_tokens_seen": 541180928,
"step": 528500
},
{
"epoch": 4.880929314178685,
"grad_norm": 0.9010013937950134,
"learning_rate": 2.559539956265397e-05,
"loss": 0.0334,
"num_input_tokens_seen": 541692928,
"step": 529000
},
{
"epoch": 4.885542668917984,
"grad_norm": 5.212312698364258,
"learning_rate": 2.5572332788957475e-05,
"loss": 0.0338,
"num_input_tokens_seen": 542204928,
"step": 529500
},
{
"epoch": 4.890156023657283,
"grad_norm": 0.5742513537406921,
"learning_rate": 2.5549266015260982e-05,
"loss": 0.0331,
"num_input_tokens_seen": 542716928,
"step": 530000
},
{
"epoch": 4.894769378396583,
"grad_norm": 1.1083173751831055,
"learning_rate": 2.552619924156448e-05,
"loss": 0.0332,
"num_input_tokens_seen": 543228928,
"step": 530500
},
{
"epoch": 4.899382733135882,
"grad_norm": 2.323056697845459,
"learning_rate": 2.5503132467867983e-05,
"loss": 0.0316,
"num_input_tokens_seen": 543740928,
"step": 531000
},
{
"epoch": 4.903996087875181,
"grad_norm": 0.8404493927955627,
"learning_rate": 2.548006569417149e-05,
"loss": 0.0325,
"num_input_tokens_seen": 544252928,
"step": 531500
},
{
"epoch": 4.90860944261448,
"grad_norm": 0.7807884216308594,
"learning_rate": 2.5456998920474995e-05,
"loss": 0.034,
"num_input_tokens_seen": 544764928,
"step": 532000
},
{
"epoch": 4.91322279735378,
"grad_norm": 1.5149301290512085,
"learning_rate": 2.5433932146778495e-05,
"loss": 0.0329,
"num_input_tokens_seen": 545276928,
"step": 532500
},
{
"epoch": 4.917836152093079,
"grad_norm": 2.3330907821655273,
"learning_rate": 2.5410865373082e-05,
"loss": 0.0315,
"num_input_tokens_seen": 545788928,
"step": 533000
},
{
"epoch": 4.922449506832378,
"grad_norm": 0.9304101467132568,
"learning_rate": 2.5387798599385503e-05,
"loss": 0.0316,
"num_input_tokens_seen": 546300928,
"step": 533500
},
{
"epoch": 4.927062861571677,
"grad_norm": 1.3839999437332153,
"learning_rate": 2.5364731825689004e-05,
"loss": 0.0339,
"num_input_tokens_seen": 546812928,
"step": 534000
},
{
"epoch": 4.931676216310977,
"grad_norm": 1.3032892942428589,
"learning_rate": 2.5341665051992508e-05,
"loss": 0.0377,
"num_input_tokens_seen": 547324928,
"step": 534500
},
{
"epoch": 4.936289571050276,
"grad_norm": 0.5184182524681091,
"learning_rate": 2.5318598278296012e-05,
"loss": 0.0327,
"num_input_tokens_seen": 547836928,
"step": 535000
},
{
"epoch": 4.940902925789576,
"grad_norm": 4.176392078399658,
"learning_rate": 2.529553150459952e-05,
"loss": 0.0311,
"num_input_tokens_seen": 548348928,
"step": 535500
},
{
"epoch": 4.945516280528875,
"grad_norm": 1.8942577838897705,
"learning_rate": 2.5272464730903016e-05,
"loss": 0.0312,
"num_input_tokens_seen": 548860928,
"step": 536000
},
{
"epoch": 4.9501296352681745,
"grad_norm": 0.4011167585849762,
"learning_rate": 2.524939795720652e-05,
"loss": 0.0297,
"num_input_tokens_seen": 549372928,
"step": 536500
},
{
"epoch": 4.954742990007474,
"grad_norm": 1.2499672174453735,
"learning_rate": 2.5226331183510028e-05,
"loss": 0.0351,
"num_input_tokens_seen": 549884928,
"step": 537000
},
{
"epoch": 4.959356344746773,
"grad_norm": 1.7503982782363892,
"learning_rate": 2.520326440981353e-05,
"loss": 0.0346,
"num_input_tokens_seen": 550396928,
"step": 537500
},
{
"epoch": 4.963969699486072,
"grad_norm": 0.9771599173545837,
"learning_rate": 2.5180197636117032e-05,
"loss": 0.0344,
"num_input_tokens_seen": 550908928,
"step": 538000
},
{
"epoch": 4.9685830542253715,
"grad_norm": 1.7374619245529175,
"learning_rate": 2.5157130862420536e-05,
"loss": 0.0328,
"num_input_tokens_seen": 551420928,
"step": 538500
},
{
"epoch": 4.973196408964671,
"grad_norm": 2.459627866744995,
"learning_rate": 2.513406408872404e-05,
"loss": 0.0304,
"num_input_tokens_seen": 551932928,
"step": 539000
},
{
"epoch": 4.97780976370397,
"grad_norm": 1.0150238275527954,
"learning_rate": 2.511099731502754e-05,
"loss": 0.0341,
"num_input_tokens_seen": 552444928,
"step": 539500
},
{
"epoch": 4.982423118443269,
"grad_norm": 0.5386485457420349,
"learning_rate": 2.5087930541331045e-05,
"loss": 0.0323,
"num_input_tokens_seen": 552956928,
"step": 540000
},
{
"epoch": 4.9870364731825685,
"grad_norm": 2.0339949131011963,
"learning_rate": 2.506486376763455e-05,
"loss": 0.0308,
"num_input_tokens_seen": 553468928,
"step": 540500
},
{
"epoch": 4.991649827921869,
"grad_norm": 0.7838632464408875,
"learning_rate": 2.5041796993938056e-05,
"loss": 0.0335,
"num_input_tokens_seen": 553980928,
"step": 541000
},
{
"epoch": 4.996263182661168,
"grad_norm": 1.2253855466842651,
"learning_rate": 2.5018730220241553e-05,
"loss": 0.0325,
"num_input_tokens_seen": 554492928,
"step": 541500
},
{
"epoch": 5.0,
"eval_combined_score": 0.0704431934497777,
"eval_loss": 0.07044319063425064,
"eval_mse": 0.07044319626530475,
"eval_runtime": 45.8855,
"eval_samples_per_second": 2099.529,
"eval_steps_per_second": 262.457,
"num_input_tokens_seen": 554906880,
"step": 541905
},
{
"epoch": 5.000876537400467,
"grad_norm": 1.9685852527618408,
"learning_rate": 2.499566344654506e-05,
"loss": 0.0352,
"num_input_tokens_seen": 555004160,
"step": 542000
},
{
"epoch": 5.005489892139766,
"grad_norm": 1.419827938079834,
"learning_rate": 2.4972596672848565e-05,
"loss": 0.0302,
"num_input_tokens_seen": 555516160,
"step": 542500
},
{
"epoch": 5.010103246879066,
"grad_norm": 3.999183177947998,
"learning_rate": 2.4949529899152065e-05,
"loss": 0.0242,
"num_input_tokens_seen": 556028160,
"step": 543000
},
{
"epoch": 5.014716601618365,
"grad_norm": 1.758694052696228,
"learning_rate": 2.4926463125455573e-05,
"loss": 0.0261,
"num_input_tokens_seen": 556540160,
"step": 543500
},
{
"epoch": 5.019329956357664,
"grad_norm": 1.1982614994049072,
"learning_rate": 2.4903396351759073e-05,
"loss": 0.0245,
"num_input_tokens_seen": 557052160,
"step": 544000
},
{
"epoch": 5.023943311096963,
"grad_norm": 0.8155698180198669,
"learning_rate": 2.4880329578062577e-05,
"loss": 0.0252,
"num_input_tokens_seen": 557564160,
"step": 544500
},
{
"epoch": 5.028556665836263,
"grad_norm": 0.5454326272010803,
"learning_rate": 2.485726280436608e-05,
"loss": 0.0243,
"num_input_tokens_seen": 558076160,
"step": 545000
},
{
"epoch": 5.033170020575562,
"grad_norm": 0.35681942105293274,
"learning_rate": 2.4834196030669585e-05,
"loss": 0.0259,
"num_input_tokens_seen": 558588160,
"step": 545500
},
{
"epoch": 5.037783375314861,
"grad_norm": 1.3723911046981812,
"learning_rate": 2.4811129256973086e-05,
"loss": 0.0254,
"num_input_tokens_seen": 559100160,
"step": 546000
},
{
"epoch": 5.042396730054161,
"grad_norm": 2.3160240650177,
"learning_rate": 2.478806248327659e-05,
"loss": 0.0238,
"num_input_tokens_seen": 559612160,
"step": 546500
},
{
"epoch": 5.0470100847934605,
"grad_norm": 0.447410523891449,
"learning_rate": 2.4764995709580094e-05,
"loss": 0.0245,
"num_input_tokens_seen": 560124160,
"step": 547000
},
{
"epoch": 5.05162343953276,
"grad_norm": 1.798653483390808,
"learning_rate": 2.4741928935883598e-05,
"loss": 0.0264,
"num_input_tokens_seen": 560636160,
"step": 547500
},
{
"epoch": 5.056236794272059,
"grad_norm": 0.5568801164627075,
"learning_rate": 2.47188621621871e-05,
"loss": 0.0262,
"num_input_tokens_seen": 561148160,
"step": 548000
},
{
"epoch": 5.060850149011358,
"grad_norm": 0.5296237468719482,
"learning_rate": 2.4695795388490602e-05,
"loss": 0.0257,
"num_input_tokens_seen": 561660160,
"step": 548500
},
{
"epoch": 5.0654635037506575,
"grad_norm": 1.8144594430923462,
"learning_rate": 2.467272861479411e-05,
"loss": 0.0244,
"num_input_tokens_seen": 562172160,
"step": 549000
},
{
"epoch": 5.070076858489957,
"grad_norm": 1.125553846359253,
"learning_rate": 2.464966184109761e-05,
"loss": 0.0278,
"num_input_tokens_seen": 562684160,
"step": 549500
},
{
"epoch": 5.074690213229256,
"grad_norm": 1.2279289960861206,
"learning_rate": 2.4626595067401114e-05,
"loss": 0.0254,
"num_input_tokens_seen": 563196160,
"step": 550000
},
{
"epoch": 5.079303567968555,
"grad_norm": 1.1253972053527832,
"learning_rate": 2.4603528293704618e-05,
"loss": 0.0273,
"num_input_tokens_seen": 563708160,
"step": 550500
},
{
"epoch": 5.0839169227078544,
"grad_norm": 1.958179235458374,
"learning_rate": 2.4580461520008122e-05,
"loss": 0.0227,
"num_input_tokens_seen": 564220160,
"step": 551000
},
{
"epoch": 5.088530277447154,
"grad_norm": 1.6592975854873657,
"learning_rate": 2.4557394746311622e-05,
"loss": 0.0265,
"num_input_tokens_seen": 564732160,
"step": 551500
},
{
"epoch": 5.093143632186453,
"grad_norm": 0.9499948024749756,
"learning_rate": 2.453432797261513e-05,
"loss": 0.0257,
"num_input_tokens_seen": 565244160,
"step": 552000
},
{
"epoch": 5.097756986925753,
"grad_norm": 0.7857697606086731,
"learning_rate": 2.451126119891863e-05,
"loss": 0.0256,
"num_input_tokens_seen": 565756160,
"step": 552500
},
{
"epoch": 5.102370341665052,
"grad_norm": 1.4605727195739746,
"learning_rate": 2.4488194425222134e-05,
"loss": 0.0241,
"num_input_tokens_seen": 566268160,
"step": 553000
},
{
"epoch": 5.106983696404352,
"grad_norm": 1.2469509840011597,
"learning_rate": 2.446512765152564e-05,
"loss": 0.0248,
"num_input_tokens_seen": 566780160,
"step": 553500
},
{
"epoch": 5.111597051143651,
"grad_norm": 1.826318383216858,
"learning_rate": 2.444206087782914e-05,
"loss": 0.0288,
"num_input_tokens_seen": 567292160,
"step": 554000
},
{
"epoch": 5.11621040588295,
"grad_norm": 4.358790397644043,
"learning_rate": 2.4418994104132646e-05,
"loss": 0.0248,
"num_input_tokens_seen": 567804160,
"step": 554500
},
{
"epoch": 5.120823760622249,
"grad_norm": 1.07144033908844,
"learning_rate": 2.4395927330436147e-05,
"loss": 0.0266,
"num_input_tokens_seen": 568316160,
"step": 555000
},
{
"epoch": 5.125437115361549,
"grad_norm": 1.7916905879974365,
"learning_rate": 2.437286055673965e-05,
"loss": 0.0268,
"num_input_tokens_seen": 568828160,
"step": 555500
},
{
"epoch": 5.130050470100848,
"grad_norm": 0.9158410429954529,
"learning_rate": 2.4349793783043155e-05,
"loss": 0.0263,
"num_input_tokens_seen": 569340160,
"step": 556000
},
{
"epoch": 5.134663824840147,
"grad_norm": 0.7724267244338989,
"learning_rate": 2.432672700934666e-05,
"loss": 0.0244,
"num_input_tokens_seen": 569852160,
"step": 556500
},
{
"epoch": 5.139277179579446,
"grad_norm": 0.48507311940193176,
"learning_rate": 2.430366023565016e-05,
"loss": 0.0274,
"num_input_tokens_seen": 570364160,
"step": 557000
},
{
"epoch": 5.1438905343187455,
"grad_norm": 0.6313498616218567,
"learning_rate": 2.4280593461953667e-05,
"loss": 0.0239,
"num_input_tokens_seen": 570876160,
"step": 557500
},
{
"epoch": 5.148503889058045,
"grad_norm": 0.987579345703125,
"learning_rate": 2.4257526688257167e-05,
"loss": 0.0255,
"num_input_tokens_seen": 571388160,
"step": 558000
},
{
"epoch": 5.153117243797345,
"grad_norm": 1.7795839309692383,
"learning_rate": 2.423445991456067e-05,
"loss": 0.0245,
"num_input_tokens_seen": 571900160,
"step": 558500
},
{
"epoch": 5.157730598536644,
"grad_norm": 1.233028531074524,
"learning_rate": 2.4211393140864175e-05,
"loss": 0.0272,
"num_input_tokens_seen": 572412160,
"step": 559000
},
{
"epoch": 5.162343953275943,
"grad_norm": 0.9197332262992859,
"learning_rate": 2.4188326367167676e-05,
"loss": 0.0243,
"num_input_tokens_seen": 572924160,
"step": 559500
},
{
"epoch": 5.166957308015243,
"grad_norm": 5.717777252197266,
"learning_rate": 2.4165259593471183e-05,
"loss": 0.024,
"num_input_tokens_seen": 573436160,
"step": 560000
},
{
"epoch": 5.171570662754542,
"grad_norm": 0.8062294721603394,
"learning_rate": 2.4142192819774684e-05,
"loss": 0.025,
"num_input_tokens_seen": 573948160,
"step": 560500
},
{
"epoch": 5.176184017493841,
"grad_norm": 1.5993818044662476,
"learning_rate": 2.4119126046078188e-05,
"loss": 0.0276,
"num_input_tokens_seen": 574460160,
"step": 561000
},
{
"epoch": 5.18079737223314,
"grad_norm": 1.086608648300171,
"learning_rate": 2.4096059272381692e-05,
"loss": 0.0237,
"num_input_tokens_seen": 574972160,
"step": 561500
},
{
"epoch": 5.18541072697244,
"grad_norm": 0.5633468627929688,
"learning_rate": 2.4072992498685196e-05,
"loss": 0.0267,
"num_input_tokens_seen": 575484160,
"step": 562000
},
{
"epoch": 5.190024081711739,
"grad_norm": 0.9681257605552673,
"learning_rate": 2.4049925724988696e-05,
"loss": 0.0247,
"num_input_tokens_seen": 575996160,
"step": 562500
},
{
"epoch": 5.194637436451038,
"grad_norm": 0.5693821907043457,
"learning_rate": 2.4026858951292204e-05,
"loss": 0.0262,
"num_input_tokens_seen": 576508160,
"step": 563000
},
{
"epoch": 5.199250791190337,
"grad_norm": 0.5459065437316895,
"learning_rate": 2.4003792177595704e-05,
"loss": 0.0246,
"num_input_tokens_seen": 577020160,
"step": 563500
},
{
"epoch": 5.2038641459296375,
"grad_norm": 0.8124216198921204,
"learning_rate": 2.3980725403899208e-05,
"loss": 0.0261,
"num_input_tokens_seen": 577532160,
"step": 564000
},
{
"epoch": 5.208477500668937,
"grad_norm": 2.0479400157928467,
"learning_rate": 2.3957658630202712e-05,
"loss": 0.0263,
"num_input_tokens_seen": 578044160,
"step": 564500
},
{
"epoch": 5.213090855408236,
"grad_norm": 0.4062500596046448,
"learning_rate": 2.3934591856506216e-05,
"loss": 0.0293,
"num_input_tokens_seen": 578556160,
"step": 565000
},
{
"epoch": 5.217704210147535,
"grad_norm": 0.6792827844619751,
"learning_rate": 2.391152508280972e-05,
"loss": 0.0243,
"num_input_tokens_seen": 579068160,
"step": 565500
},
{
"epoch": 5.2223175648868345,
"grad_norm": 1.978621482849121,
"learning_rate": 2.388845830911322e-05,
"loss": 0.0242,
"num_input_tokens_seen": 579580160,
"step": 566000
},
{
"epoch": 5.226930919626134,
"grad_norm": 1.0961169004440308,
"learning_rate": 2.3865391535416725e-05,
"loss": 0.0264,
"num_input_tokens_seen": 580092160,
"step": 566500
},
{
"epoch": 5.231544274365433,
"grad_norm": 2.3269541263580322,
"learning_rate": 2.384232476172023e-05,
"loss": 0.0246,
"num_input_tokens_seen": 580604160,
"step": 567000
},
{
"epoch": 5.236157629104732,
"grad_norm": 0.545312762260437,
"learning_rate": 2.3819257988023733e-05,
"loss": 0.0259,
"num_input_tokens_seen": 581116160,
"step": 567500
},
{
"epoch": 5.2407709838440315,
"grad_norm": 0.7577276825904846,
"learning_rate": 2.3796191214327233e-05,
"loss": 0.026,
"num_input_tokens_seen": 581628160,
"step": 568000
},
{
"epoch": 5.245384338583331,
"grad_norm": 0.5405977964401245,
"learning_rate": 2.377312444063074e-05,
"loss": 0.0232,
"num_input_tokens_seen": 582140160,
"step": 568500
},
{
"epoch": 5.24999769332263,
"grad_norm": 0.5924959182739258,
"learning_rate": 2.375005766693424e-05,
"loss": 0.0264,
"num_input_tokens_seen": 582652160,
"step": 569000
},
{
"epoch": 5.25461104806193,
"grad_norm": 1.2683016061782837,
"learning_rate": 2.3726990893237745e-05,
"loss": 0.0262,
"num_input_tokens_seen": 583164160,
"step": 569500
},
{
"epoch": 5.259224402801229,
"grad_norm": 1.1642249822616577,
"learning_rate": 2.370392411954125e-05,
"loss": 0.0263,
"num_input_tokens_seen": 583676160,
"step": 570000
},
{
"epoch": 5.263837757540529,
"grad_norm": 1.1712781190872192,
"learning_rate": 2.3680857345844753e-05,
"loss": 0.0254,
"num_input_tokens_seen": 584188160,
"step": 570500
},
{
"epoch": 5.268451112279828,
"grad_norm": 1.0108134746551514,
"learning_rate": 2.3657790572148257e-05,
"loss": 0.0256,
"num_input_tokens_seen": 584700160,
"step": 571000
},
{
"epoch": 5.273064467019127,
"grad_norm": 2.7338948249816895,
"learning_rate": 2.363472379845176e-05,
"loss": 0.0275,
"num_input_tokens_seen": 585212160,
"step": 571500
},
{
"epoch": 5.277677821758426,
"grad_norm": 0.6406319737434387,
"learning_rate": 2.361165702475526e-05,
"loss": 0.0238,
"num_input_tokens_seen": 585724160,
"step": 572000
},
{
"epoch": 5.282291176497726,
"grad_norm": 1.551131010055542,
"learning_rate": 2.3588590251058766e-05,
"loss": 0.0261,
"num_input_tokens_seen": 586236160,
"step": 572500
},
{
"epoch": 5.286904531237025,
"grad_norm": 0.41061103343963623,
"learning_rate": 2.356552347736227e-05,
"loss": 0.0281,
"num_input_tokens_seen": 586748160,
"step": 573000
},
{
"epoch": 5.291517885976324,
"grad_norm": 0.7769986987113953,
"learning_rate": 2.354245670366577e-05,
"loss": 0.0251,
"num_input_tokens_seen": 587260160,
"step": 573500
},
{
"epoch": 5.296131240715623,
"grad_norm": 1.0587828159332275,
"learning_rate": 2.3519389929969277e-05,
"loss": 0.024,
"num_input_tokens_seen": 587772160,
"step": 574000
},
{
"epoch": 5.300744595454923,
"grad_norm": 0.7457670569419861,
"learning_rate": 2.3496323156272778e-05,
"loss": 0.0258,
"num_input_tokens_seen": 588284160,
"step": 574500
},
{
"epoch": 5.305357950194222,
"grad_norm": 1.7087829113006592,
"learning_rate": 2.3473256382576282e-05,
"loss": 0.0265,
"num_input_tokens_seen": 588796160,
"step": 575000
},
{
"epoch": 5.309971304933521,
"grad_norm": 1.6121881008148193,
"learning_rate": 2.3450189608879786e-05,
"loss": 0.0236,
"num_input_tokens_seen": 589308160,
"step": 575500
},
{
"epoch": 5.314584659672821,
"grad_norm": 1.585402011871338,
"learning_rate": 2.342712283518329e-05,
"loss": 0.0253,
"num_input_tokens_seen": 589820160,
"step": 576000
},
{
"epoch": 5.3191980144121205,
"grad_norm": 2.160334348678589,
"learning_rate": 2.3404056061486794e-05,
"loss": 0.0266,
"num_input_tokens_seen": 590332160,
"step": 576500
},
{
"epoch": 5.32381136915142,
"grad_norm": 0.304321825504303,
"learning_rate": 2.3380989287790298e-05,
"loss": 0.0268,
"num_input_tokens_seen": 590844160,
"step": 577000
},
{
"epoch": 5.328424723890719,
"grad_norm": 0.9023957848548889,
"learning_rate": 2.33579225140938e-05,
"loss": 0.0258,
"num_input_tokens_seen": 591356160,
"step": 577500
},
{
"epoch": 5.333038078630018,
"grad_norm": 0.5087705254554749,
"learning_rate": 2.3334855740397306e-05,
"loss": 0.026,
"num_input_tokens_seen": 591868160,
"step": 578000
},
{
"epoch": 5.3376514333693175,
"grad_norm": 1.3647748231887817,
"learning_rate": 2.3311788966700806e-05,
"loss": 0.0268,
"num_input_tokens_seen": 592380160,
"step": 578500
},
{
"epoch": 5.342264788108617,
"grad_norm": 1.011982798576355,
"learning_rate": 2.328872219300431e-05,
"loss": 0.0267,
"num_input_tokens_seen": 592892160,
"step": 579000
},
{
"epoch": 5.346878142847916,
"grad_norm": 1.695412516593933,
"learning_rate": 2.3265655419307814e-05,
"loss": 0.0244,
"num_input_tokens_seen": 593404160,
"step": 579500
},
{
"epoch": 5.351491497587215,
"grad_norm": 2.6255669593811035,
"learning_rate": 2.3242588645611315e-05,
"loss": 0.0279,
"num_input_tokens_seen": 593916160,
"step": 580000
},
{
"epoch": 5.3561048523265145,
"grad_norm": 1.49470055103302,
"learning_rate": 2.321952187191482e-05,
"loss": 0.0255,
"num_input_tokens_seen": 594428160,
"step": 580500
},
{
"epoch": 5.360718207065814,
"grad_norm": 5.862457275390625,
"learning_rate": 2.3196455098218323e-05,
"loss": 0.0272,
"num_input_tokens_seen": 594940160,
"step": 581000
},
{
"epoch": 5.365331561805114,
"grad_norm": 1.1416678428649902,
"learning_rate": 2.3173388324521827e-05,
"loss": 0.0257,
"num_input_tokens_seen": 595452160,
"step": 581500
},
{
"epoch": 5.369944916544413,
"grad_norm": 1.0137473344802856,
"learning_rate": 2.315032155082533e-05,
"loss": 0.0278,
"num_input_tokens_seen": 595964160,
"step": 582000
},
{
"epoch": 5.374558271283712,
"grad_norm": 1.037350058555603,
"learning_rate": 2.3127254777128835e-05,
"loss": 0.0242,
"num_input_tokens_seen": 596476160,
"step": 582500
},
{
"epoch": 5.379171626023012,
"grad_norm": 0.5939755439758301,
"learning_rate": 2.3104188003432335e-05,
"loss": 0.0253,
"num_input_tokens_seen": 596988160,
"step": 583000
},
{
"epoch": 5.383784980762311,
"grad_norm": 0.8637872934341431,
"learning_rate": 2.3081121229735843e-05,
"loss": 0.0294,
"num_input_tokens_seen": 597500160,
"step": 583500
},
{
"epoch": 5.38839833550161,
"grad_norm": 0.6153502464294434,
"learning_rate": 2.3058054456039343e-05,
"loss": 0.0252,
"num_input_tokens_seen": 598012160,
"step": 584000
},
{
"epoch": 5.393011690240909,
"grad_norm": 0.7826283574104309,
"learning_rate": 2.3034987682342847e-05,
"loss": 0.0242,
"num_input_tokens_seen": 598524160,
"step": 584500
},
{
"epoch": 5.397625044980209,
"grad_norm": 0.8609397411346436,
"learning_rate": 2.301192090864635e-05,
"loss": 0.0281,
"num_input_tokens_seen": 599036160,
"step": 585000
},
{
"epoch": 5.402238399719508,
"grad_norm": 1.031718134880066,
"learning_rate": 2.2988854134949852e-05,
"loss": 0.0264,
"num_input_tokens_seen": 599548160,
"step": 585500
},
{
"epoch": 5.406851754458807,
"grad_norm": 4.244394779205322,
"learning_rate": 2.296578736125336e-05,
"loss": 0.0284,
"num_input_tokens_seen": 600060160,
"step": 586000
},
{
"epoch": 5.411465109198106,
"grad_norm": 0.6755638122558594,
"learning_rate": 2.294272058755686e-05,
"loss": 0.0256,
"num_input_tokens_seen": 600572160,
"step": 586500
},
{
"epoch": 5.416078463937406,
"grad_norm": 0.5303651690483093,
"learning_rate": 2.2919653813860364e-05,
"loss": 0.0272,
"num_input_tokens_seen": 601084160,
"step": 587000
},
{
"epoch": 5.420691818676706,
"grad_norm": 0.8649631142616272,
"learning_rate": 2.2896587040163868e-05,
"loss": 0.0245,
"num_input_tokens_seen": 601596160,
"step": 587500
},
{
"epoch": 5.425305173416005,
"grad_norm": 0.5191958546638489,
"learning_rate": 2.287352026646737e-05,
"loss": 0.0271,
"num_input_tokens_seen": 602108160,
"step": 588000
},
{
"epoch": 5.429918528155304,
"grad_norm": 1.2616572380065918,
"learning_rate": 2.2850453492770872e-05,
"loss": 0.0271,
"num_input_tokens_seen": 602620160,
"step": 588500
},
{
"epoch": 5.434531882894603,
"grad_norm": 0.8619266152381897,
"learning_rate": 2.282738671907438e-05,
"loss": 0.0262,
"num_input_tokens_seen": 603132160,
"step": 589000
},
{
"epoch": 5.439145237633903,
"grad_norm": 0.7039788961410522,
"learning_rate": 2.280431994537788e-05,
"loss": 0.0247,
"num_input_tokens_seen": 603644160,
"step": 589500
},
{
"epoch": 5.443758592373202,
"grad_norm": 2.772310495376587,
"learning_rate": 2.2781253171681384e-05,
"loss": 0.0267,
"num_input_tokens_seen": 604156160,
"step": 590000
},
{
"epoch": 5.448371947112501,
"grad_norm": 0.5451655387878418,
"learning_rate": 2.2758186397984888e-05,
"loss": 0.0261,
"num_input_tokens_seen": 604668160,
"step": 590500
},
{
"epoch": 5.4529853018518,
"grad_norm": 0.8995614647865295,
"learning_rate": 2.2735119624288392e-05,
"loss": 0.024,
"num_input_tokens_seen": 605180160,
"step": 591000
},
{
"epoch": 5.4575986565911,
"grad_norm": 1.981187105178833,
"learning_rate": 2.2712052850591896e-05,
"loss": 0.0263,
"num_input_tokens_seen": 605692160,
"step": 591500
},
{
"epoch": 5.462212011330399,
"grad_norm": 0.7811481952667236,
"learning_rate": 2.2688986076895397e-05,
"loss": 0.0286,
"num_input_tokens_seen": 606204160,
"step": 592000
},
{
"epoch": 5.466825366069698,
"grad_norm": 2.7757558822631836,
"learning_rate": 2.26659193031989e-05,
"loss": 0.0253,
"num_input_tokens_seen": 606716160,
"step": 592500
},
{
"epoch": 5.471438720808998,
"grad_norm": 1.9782260656356812,
"learning_rate": 2.2642852529502405e-05,
"loss": 0.0277,
"num_input_tokens_seen": 607228160,
"step": 593000
},
{
"epoch": 5.4760520755482975,
"grad_norm": 2.8401777744293213,
"learning_rate": 2.261978575580591e-05,
"loss": 0.0255,
"num_input_tokens_seen": 607740160,
"step": 593500
},
{
"epoch": 5.480665430287597,
"grad_norm": 0.5879292488098145,
"learning_rate": 2.259671898210941e-05,
"loss": 0.027,
"num_input_tokens_seen": 608252160,
"step": 594000
},
{
"epoch": 5.485278785026896,
"grad_norm": 1.1103825569152832,
"learning_rate": 2.2573652208412917e-05,
"loss": 0.0258,
"num_input_tokens_seen": 608764160,
"step": 594500
},
{
"epoch": 5.489892139766195,
"grad_norm": 1.002668857574463,
"learning_rate": 2.2550585434716417e-05,
"loss": 0.0276,
"num_input_tokens_seen": 609276160,
"step": 595000
},
{
"epoch": 5.4945054945054945,
"grad_norm": 0.5841794013977051,
"learning_rate": 2.252751866101992e-05,
"loss": 0.0272,
"num_input_tokens_seen": 609788160,
"step": 595500
},
{
"epoch": 5.499118849244794,
"grad_norm": 0.6137141585350037,
"learning_rate": 2.2504451887323425e-05,
"loss": 0.0269,
"num_input_tokens_seen": 610300160,
"step": 596000
},
{
"epoch": 5.503732203984093,
"grad_norm": 0.6018849015235901,
"learning_rate": 2.248138511362693e-05,
"loss": 0.0279,
"num_input_tokens_seen": 610812160,
"step": 596500
},
{
"epoch": 5.508345558723392,
"grad_norm": 1.4851562976837158,
"learning_rate": 2.2458318339930433e-05,
"loss": 0.0268,
"num_input_tokens_seen": 611324160,
"step": 597000
},
{
"epoch": 5.5129589134626915,
"grad_norm": 1.9454591274261475,
"learning_rate": 2.2435251566233937e-05,
"loss": 0.0258,
"num_input_tokens_seen": 611836160,
"step": 597500
},
{
"epoch": 5.517572268201991,
"grad_norm": 1.9615495204925537,
"learning_rate": 2.2412184792537438e-05,
"loss": 0.0257,
"num_input_tokens_seen": 612348160,
"step": 598000
},
{
"epoch": 5.52218562294129,
"grad_norm": 1.1803622245788574,
"learning_rate": 2.238911801884094e-05,
"loss": 0.0256,
"num_input_tokens_seen": 612860160,
"step": 598500
},
{
"epoch": 5.52679897768059,
"grad_norm": 0.7780105471611023,
"learning_rate": 2.2366051245144445e-05,
"loss": 0.0264,
"num_input_tokens_seen": 613372160,
"step": 599000
},
{
"epoch": 5.531412332419889,
"grad_norm": 0.5582423806190491,
"learning_rate": 2.2342984471447946e-05,
"loss": 0.0279,
"num_input_tokens_seen": 613884160,
"step": 599500
},
{
"epoch": 5.536025687159189,
"grad_norm": 1.4547449350357056,
"learning_rate": 2.2319917697751453e-05,
"loss": 0.0273,
"num_input_tokens_seen": 614396160,
"step": 600000
},
{
"epoch": 5.540639041898488,
"grad_norm": 1.0105394124984741,
"learning_rate": 2.2296850924054954e-05,
"loss": 0.0251,
"num_input_tokens_seen": 614908160,
"step": 600500
},
{
"epoch": 5.545252396637787,
"grad_norm": 0.7775139212608337,
"learning_rate": 2.2273784150358458e-05,
"loss": 0.0258,
"num_input_tokens_seen": 615420160,
"step": 601000
},
{
"epoch": 5.549865751377086,
"grad_norm": 0.40573227405548096,
"learning_rate": 2.2250717376661962e-05,
"loss": 0.0268,
"num_input_tokens_seen": 615932160,
"step": 601500
},
{
"epoch": 5.554479106116386,
"grad_norm": 1.130553126335144,
"learning_rate": 2.2227650602965466e-05,
"loss": 0.0255,
"num_input_tokens_seen": 616444160,
"step": 602000
},
{
"epoch": 5.559092460855685,
"grad_norm": 1.0450289249420166,
"learning_rate": 2.220458382926897e-05,
"loss": 0.0266,
"num_input_tokens_seen": 616956160,
"step": 602500
},
{
"epoch": 5.563705815594984,
"grad_norm": 0.7919219136238098,
"learning_rate": 2.2181517055572474e-05,
"loss": 0.0269,
"num_input_tokens_seen": 617468160,
"step": 603000
},
{
"epoch": 5.568319170334283,
"grad_norm": 0.7787536382675171,
"learning_rate": 2.2158450281875974e-05,
"loss": 0.0305,
"num_input_tokens_seen": 617980160,
"step": 603500
},
{
"epoch": 5.572932525073583,
"grad_norm": 1.2866960763931274,
"learning_rate": 2.2135383508179482e-05,
"loss": 0.028,
"num_input_tokens_seen": 618492160,
"step": 604000
},
{
"epoch": 5.577545879812883,
"grad_norm": 1.9128954410552979,
"learning_rate": 2.2112316734482982e-05,
"loss": 0.0276,
"num_input_tokens_seen": 619004160,
"step": 604500
},
{
"epoch": 5.582159234552182,
"grad_norm": 1.13468337059021,
"learning_rate": 2.2089249960786483e-05,
"loss": 0.027,
"num_input_tokens_seen": 619516160,
"step": 605000
},
{
"epoch": 5.586772589291481,
"grad_norm": 1.4375085830688477,
"learning_rate": 2.206618318708999e-05,
"loss": 0.0232,
"num_input_tokens_seen": 620028160,
"step": 605500
},
{
"epoch": 5.5913859440307805,
"grad_norm": 0.722649872303009,
"learning_rate": 2.204311641339349e-05,
"loss": 0.0259,
"num_input_tokens_seen": 620540160,
"step": 606000
},
{
"epoch": 5.59599929877008,
"grad_norm": 0.8669957518577576,
"learning_rate": 2.2020049639696995e-05,
"loss": 0.0262,
"num_input_tokens_seen": 621052160,
"step": 606500
},
{
"epoch": 5.600612653509379,
"grad_norm": 0.8053223490715027,
"learning_rate": 2.19969828660005e-05,
"loss": 0.0302,
"num_input_tokens_seen": 621564160,
"step": 607000
},
{
"epoch": 5.605226008248678,
"grad_norm": 1.0647988319396973,
"learning_rate": 2.1973916092304003e-05,
"loss": 0.0263,
"num_input_tokens_seen": 622076160,
"step": 607500
},
{
"epoch": 5.6098393629879775,
"grad_norm": 1.0449702739715576,
"learning_rate": 2.1950849318607507e-05,
"loss": 0.0292,
"num_input_tokens_seen": 622588160,
"step": 608000
},
{
"epoch": 5.614452717727277,
"grad_norm": 0.8551065921783447,
"learning_rate": 2.192778254491101e-05,
"loss": 0.026,
"num_input_tokens_seen": 623100160,
"step": 608500
},
{
"epoch": 5.619066072466576,
"grad_norm": 0.9317313432693481,
"learning_rate": 2.190471577121451e-05,
"loss": 0.0273,
"num_input_tokens_seen": 623612160,
"step": 609000
},
{
"epoch": 5.623679427205875,
"grad_norm": 1.1779793500900269,
"learning_rate": 2.188164899751802e-05,
"loss": 0.0267,
"num_input_tokens_seen": 624124160,
"step": 609500
},
{
"epoch": 5.628292781945175,
"grad_norm": 0.7221566438674927,
"learning_rate": 2.185858222382152e-05,
"loss": 0.0263,
"num_input_tokens_seen": 624636160,
"step": 610000
},
{
"epoch": 5.632906136684475,
"grad_norm": 1.5405559539794922,
"learning_rate": 2.1835515450125023e-05,
"loss": 0.0242,
"num_input_tokens_seen": 625148160,
"step": 610500
},
{
"epoch": 5.637519491423774,
"grad_norm": 1.2586696147918701,
"learning_rate": 2.1812448676428527e-05,
"loss": 0.0259,
"num_input_tokens_seen": 625660160,
"step": 611000
},
{
"epoch": 5.642132846163073,
"grad_norm": 1.4537557363510132,
"learning_rate": 2.1789381902732028e-05,
"loss": 0.0254,
"num_input_tokens_seen": 626172160,
"step": 611500
},
{
"epoch": 5.646746200902372,
"grad_norm": 0.7319709658622742,
"learning_rate": 2.1766315129035532e-05,
"loss": 0.0286,
"num_input_tokens_seen": 626684160,
"step": 612000
},
{
"epoch": 5.651359555641672,
"grad_norm": 0.6492053866386414,
"learning_rate": 2.1743248355339036e-05,
"loss": 0.0266,
"num_input_tokens_seen": 627196160,
"step": 612500
},
{
"epoch": 5.655972910380971,
"grad_norm": 1.0684195756912231,
"learning_rate": 2.172018158164254e-05,
"loss": 0.0239,
"num_input_tokens_seen": 627708160,
"step": 613000
},
{
"epoch": 5.66058626512027,
"grad_norm": 1.018306851387024,
"learning_rate": 2.1697114807946044e-05,
"loss": 0.027,
"num_input_tokens_seen": 628220160,
"step": 613500
},
{
"epoch": 5.665199619859569,
"grad_norm": 0.5089601278305054,
"learning_rate": 2.1674048034249548e-05,
"loss": 0.0258,
"num_input_tokens_seen": 628732160,
"step": 614000
},
{
"epoch": 5.669812974598869,
"grad_norm": 1.606461763381958,
"learning_rate": 2.1650981260553048e-05,
"loss": 0.0267,
"num_input_tokens_seen": 629244160,
"step": 614500
},
{
"epoch": 5.674426329338168,
"grad_norm": 1.479805588722229,
"learning_rate": 2.1627914486856556e-05,
"loss": 0.0266,
"num_input_tokens_seen": 629756160,
"step": 615000
},
{
"epoch": 5.679039684077467,
"grad_norm": 2.971240758895874,
"learning_rate": 2.1604847713160056e-05,
"loss": 0.0285,
"num_input_tokens_seen": 630268160,
"step": 615500
},
{
"epoch": 5.683653038816766,
"grad_norm": 0.5969455242156982,
"learning_rate": 2.158178093946356e-05,
"loss": 0.0265,
"num_input_tokens_seen": 630780160,
"step": 616000
},
{
"epoch": 5.6882663935560664,
"grad_norm": 0.7076913118362427,
"learning_rate": 2.1558714165767064e-05,
"loss": 0.024,
"num_input_tokens_seen": 631292160,
"step": 616500
},
{
"epoch": 5.692879748295366,
"grad_norm": 0.8780455589294434,
"learning_rate": 2.1535647392070568e-05,
"loss": 0.0253,
"num_input_tokens_seen": 631804160,
"step": 617000
},
{
"epoch": 5.697493103034665,
"grad_norm": 3.569014549255371,
"learning_rate": 2.151258061837407e-05,
"loss": 0.0252,
"num_input_tokens_seen": 632316160,
"step": 617500
},
{
"epoch": 5.702106457773964,
"grad_norm": 0.9523796439170837,
"learning_rate": 2.1489513844677573e-05,
"loss": 0.0275,
"num_input_tokens_seen": 632828160,
"step": 618000
},
{
"epoch": 5.706719812513263,
"grad_norm": 0.6151872873306274,
"learning_rate": 2.1466447070981077e-05,
"loss": 0.0272,
"num_input_tokens_seen": 633340160,
"step": 618500
},
{
"epoch": 5.711333167252563,
"grad_norm": 4.095676422119141,
"learning_rate": 2.144338029728458e-05,
"loss": 0.0309,
"num_input_tokens_seen": 633852160,
"step": 619000
},
{
"epoch": 5.715946521991862,
"grad_norm": 1.5436087846755981,
"learning_rate": 2.1420313523588085e-05,
"loss": 0.0237,
"num_input_tokens_seen": 634364160,
"step": 619500
},
{
"epoch": 5.720559876731161,
"grad_norm": 0.722958505153656,
"learning_rate": 2.1397246749891585e-05,
"loss": 0.024,
"num_input_tokens_seen": 634876160,
"step": 620000
},
{
"epoch": 5.72517323147046,
"grad_norm": 1.9889734983444214,
"learning_rate": 2.1374179976195092e-05,
"loss": 0.026,
"num_input_tokens_seen": 635388160,
"step": 620500
},
{
"epoch": 5.72978658620976,
"grad_norm": 1.8848015069961548,
"learning_rate": 2.1351113202498593e-05,
"loss": 0.0295,
"num_input_tokens_seen": 635900160,
"step": 621000
},
{
"epoch": 5.734399940949059,
"grad_norm": 1.4463508129119873,
"learning_rate": 2.1328046428802097e-05,
"loss": 0.0286,
"num_input_tokens_seen": 636412160,
"step": 621500
},
{
"epoch": 5.739013295688359,
"grad_norm": 2.2826876640319824,
"learning_rate": 2.13049796551056e-05,
"loss": 0.0278,
"num_input_tokens_seen": 636924160,
"step": 622000
},
{
"epoch": 5.743626650427658,
"grad_norm": 0.8323870897293091,
"learning_rate": 2.1281912881409105e-05,
"loss": 0.0247,
"num_input_tokens_seen": 637436160,
"step": 622500
},
{
"epoch": 5.7482400051669575,
"grad_norm": 1.4278696775436401,
"learning_rate": 2.1258846107712606e-05,
"loss": 0.0269,
"num_input_tokens_seen": 637948160,
"step": 623000
},
{
"epoch": 5.752853359906257,
"grad_norm": 0.425340473651886,
"learning_rate": 2.1235779334016113e-05,
"loss": 0.0263,
"num_input_tokens_seen": 638460160,
"step": 623500
},
{
"epoch": 5.757466714645556,
"grad_norm": 0.6665620803833008,
"learning_rate": 2.1212712560319614e-05,
"loss": 0.0286,
"num_input_tokens_seen": 638972160,
"step": 624000
},
{
"epoch": 5.762080069384855,
"grad_norm": 1.1083565950393677,
"learning_rate": 2.1189645786623117e-05,
"loss": 0.0251,
"num_input_tokens_seen": 639484160,
"step": 624500
},
{
"epoch": 5.7666934241241545,
"grad_norm": 1.5361641645431519,
"learning_rate": 2.116657901292662e-05,
"loss": 0.0238,
"num_input_tokens_seen": 639996160,
"step": 625000
},
{
"epoch": 5.771306778863454,
"grad_norm": 1.897976040840149,
"learning_rate": 2.1143512239230122e-05,
"loss": 0.0252,
"num_input_tokens_seen": 640508160,
"step": 625500
},
{
"epoch": 5.775920133602753,
"grad_norm": 1.181335687637329,
"learning_rate": 2.112044546553363e-05,
"loss": 0.0274,
"num_input_tokens_seen": 641020160,
"step": 626000
},
{
"epoch": 5.780533488342052,
"grad_norm": 1.2350566387176514,
"learning_rate": 2.109737869183713e-05,
"loss": 0.0269,
"num_input_tokens_seen": 641532160,
"step": 626500
},
{
"epoch": 5.7851468430813515,
"grad_norm": 0.9288113713264465,
"learning_rate": 2.1074311918140634e-05,
"loss": 0.0266,
"num_input_tokens_seen": 642044160,
"step": 627000
},
{
"epoch": 5.789760197820652,
"grad_norm": 1.3695634603500366,
"learning_rate": 2.1051245144444138e-05,
"loss": 0.0281,
"num_input_tokens_seen": 642556160,
"step": 627500
},
{
"epoch": 5.794373552559951,
"grad_norm": 1.5921497344970703,
"learning_rate": 2.1028178370747642e-05,
"loss": 0.0271,
"num_input_tokens_seen": 643068160,
"step": 628000
},
{
"epoch": 5.79898690729925,
"grad_norm": 0.9547250866889954,
"learning_rate": 2.1005111597051146e-05,
"loss": 0.0246,
"num_input_tokens_seen": 643580160,
"step": 628500
},
{
"epoch": 5.803600262038549,
"grad_norm": 0.702260434627533,
"learning_rate": 2.098204482335465e-05,
"loss": 0.0286,
"num_input_tokens_seen": 644092160,
"step": 629000
},
{
"epoch": 5.808213616777849,
"grad_norm": 1.7382519245147705,
"learning_rate": 2.095897804965815e-05,
"loss": 0.0247,
"num_input_tokens_seen": 644604160,
"step": 629500
},
{
"epoch": 5.812826971517148,
"grad_norm": 0.724609911441803,
"learning_rate": 2.0935911275961654e-05,
"loss": 0.0266,
"num_input_tokens_seen": 645116160,
"step": 630000
},
{
"epoch": 5.817440326256447,
"grad_norm": 0.8976930379867554,
"learning_rate": 2.091284450226516e-05,
"loss": 0.0261,
"num_input_tokens_seen": 645628160,
"step": 630500
},
{
"epoch": 5.822053680995746,
"grad_norm": 2.6822431087493896,
"learning_rate": 2.088977772856866e-05,
"loss": 0.0269,
"num_input_tokens_seen": 646140160,
"step": 631000
},
{
"epoch": 5.826667035735046,
"grad_norm": 0.9543342590332031,
"learning_rate": 2.0866710954872166e-05,
"loss": 0.0255,
"num_input_tokens_seen": 646652160,
"step": 631500
},
{
"epoch": 5.831280390474345,
"grad_norm": 1.0366599559783936,
"learning_rate": 2.0843644181175667e-05,
"loss": 0.0265,
"num_input_tokens_seen": 647164160,
"step": 632000
},
{
"epoch": 5.835893745213644,
"grad_norm": 2.613006830215454,
"learning_rate": 2.082057740747917e-05,
"loss": 0.0264,
"num_input_tokens_seen": 647676160,
"step": 632500
},
{
"epoch": 5.840507099952944,
"grad_norm": 0.2824631631374359,
"learning_rate": 2.0797510633782675e-05,
"loss": 0.0288,
"num_input_tokens_seen": 648188160,
"step": 633000
},
{
"epoch": 5.845120454692243,
"grad_norm": 3.399728298187256,
"learning_rate": 2.077444386008618e-05,
"loss": 0.0268,
"num_input_tokens_seen": 648700160,
"step": 633500
},
{
"epoch": 5.849733809431543,
"grad_norm": 0.7402966022491455,
"learning_rate": 2.0751377086389683e-05,
"loss": 0.0248,
"num_input_tokens_seen": 649212160,
"step": 634000
},
{
"epoch": 5.854347164170842,
"grad_norm": 0.7553480267524719,
"learning_rate": 2.0728310312693187e-05,
"loss": 0.0277,
"num_input_tokens_seen": 649724160,
"step": 634500
},
{
"epoch": 5.858960518910141,
"grad_norm": 3.4398159980773926,
"learning_rate": 2.0705243538996687e-05,
"loss": 0.0266,
"num_input_tokens_seen": 650236160,
"step": 635000
},
{
"epoch": 5.8635738736494405,
"grad_norm": 0.5711115598678589,
"learning_rate": 2.0682176765300195e-05,
"loss": 0.0241,
"num_input_tokens_seen": 650748160,
"step": 635500
},
{
"epoch": 5.86818722838874,
"grad_norm": 0.7952388525009155,
"learning_rate": 2.0659109991603695e-05,
"loss": 0.0275,
"num_input_tokens_seen": 651260160,
"step": 636000
},
{
"epoch": 5.872800583128039,
"grad_norm": 1.0399372577667236,
"learning_rate": 2.06360432179072e-05,
"loss": 0.0248,
"num_input_tokens_seen": 651772160,
"step": 636500
},
{
"epoch": 5.877413937867338,
"grad_norm": 1.6778496503829956,
"learning_rate": 2.0612976444210703e-05,
"loss": 0.0248,
"num_input_tokens_seen": 652284160,
"step": 637000
},
{
"epoch": 5.8820272926066375,
"grad_norm": 1.3442925214767456,
"learning_rate": 2.0589909670514204e-05,
"loss": 0.0271,
"num_input_tokens_seen": 652796160,
"step": 637500
},
{
"epoch": 5.886640647345937,
"grad_norm": 1.1822031736373901,
"learning_rate": 2.0566842896817708e-05,
"loss": 0.0256,
"num_input_tokens_seen": 653308160,
"step": 638000
},
{
"epoch": 5.891254002085236,
"grad_norm": 1.5322853326797485,
"learning_rate": 2.0543776123121212e-05,
"loss": 0.0269,
"num_input_tokens_seen": 653820160,
"step": 638500
},
{
"epoch": 5.895867356824535,
"grad_norm": 1.6025440692901611,
"learning_rate": 2.0520709349424716e-05,
"loss": 0.0281,
"num_input_tokens_seen": 654332160,
"step": 639000
},
{
"epoch": 5.900480711563835,
"grad_norm": 0.7516422867774963,
"learning_rate": 2.049764257572822e-05,
"loss": 0.0293,
"num_input_tokens_seen": 654844160,
"step": 639500
},
{
"epoch": 5.905094066303135,
"grad_norm": 0.7684640884399414,
"learning_rate": 2.0474575802031724e-05,
"loss": 0.0258,
"num_input_tokens_seen": 655356160,
"step": 640000
},
{
"epoch": 5.909707421042434,
"grad_norm": 1.2843828201293945,
"learning_rate": 2.0451509028335224e-05,
"loss": 0.0252,
"num_input_tokens_seen": 655868160,
"step": 640500
},
{
"epoch": 5.914320775781733,
"grad_norm": 1.0203999280929565,
"learning_rate": 2.042844225463873e-05,
"loss": 0.0268,
"num_input_tokens_seen": 656380160,
"step": 641000
},
{
"epoch": 5.918934130521032,
"grad_norm": 2.00242280960083,
"learning_rate": 2.0405375480942232e-05,
"loss": 0.0285,
"num_input_tokens_seen": 656892160,
"step": 641500
},
{
"epoch": 5.923547485260332,
"grad_norm": 1.0357120037078857,
"learning_rate": 2.0382308707245736e-05,
"loss": 0.0239,
"num_input_tokens_seen": 657404160,
"step": 642000
},
{
"epoch": 5.928160839999631,
"grad_norm": 1.1826400756835938,
"learning_rate": 2.035924193354924e-05,
"loss": 0.0268,
"num_input_tokens_seen": 657916160,
"step": 642500
},
{
"epoch": 5.93277419473893,
"grad_norm": 1.5662238597869873,
"learning_rate": 2.0336175159852744e-05,
"loss": 0.0259,
"num_input_tokens_seen": 658428160,
"step": 643000
},
{
"epoch": 5.937387549478229,
"grad_norm": 3.335893392562866,
"learning_rate": 2.0313108386156245e-05,
"loss": 0.0273,
"num_input_tokens_seen": 658940160,
"step": 643500
},
{
"epoch": 5.942000904217529,
"grad_norm": 0.7126489281654358,
"learning_rate": 2.029004161245975e-05,
"loss": 0.0268,
"num_input_tokens_seen": 659452160,
"step": 644000
},
{
"epoch": 5.946614258956828,
"grad_norm": 1.0062040090560913,
"learning_rate": 2.0266974838763253e-05,
"loss": 0.0276,
"num_input_tokens_seen": 659964160,
"step": 644500
},
{
"epoch": 5.951227613696128,
"grad_norm": 1.2691099643707275,
"learning_rate": 2.0243908065066757e-05,
"loss": 0.0295,
"num_input_tokens_seen": 660476160,
"step": 645000
},
{
"epoch": 5.955840968435427,
"grad_norm": 0.9768707752227783,
"learning_rate": 2.022084129137026e-05,
"loss": 0.0287,
"num_input_tokens_seen": 660988160,
"step": 645500
},
{
"epoch": 5.9604543231747265,
"grad_norm": 1.5846303701400757,
"learning_rate": 2.019777451767376e-05,
"loss": 0.028,
"num_input_tokens_seen": 661500160,
"step": 646000
},
{
"epoch": 5.965067677914026,
"grad_norm": 0.556376576423645,
"learning_rate": 2.017470774397727e-05,
"loss": 0.029,
"num_input_tokens_seen": 662012160,
"step": 646500
},
{
"epoch": 5.969681032653325,
"grad_norm": 1.8407984972000122,
"learning_rate": 2.015164097028077e-05,
"loss": 0.0278,
"num_input_tokens_seen": 662524160,
"step": 647000
},
{
"epoch": 5.974294387392624,
"grad_norm": 2.419261932373047,
"learning_rate": 2.0128574196584273e-05,
"loss": 0.0264,
"num_input_tokens_seen": 663036160,
"step": 647500
},
{
"epoch": 5.978907742131923,
"grad_norm": 1.3140838146209717,
"learning_rate": 2.0105507422887777e-05,
"loss": 0.0269,
"num_input_tokens_seen": 663548160,
"step": 648000
},
{
"epoch": 5.983521096871223,
"grad_norm": 1.3511277437210083,
"learning_rate": 2.008244064919128e-05,
"loss": 0.0258,
"num_input_tokens_seen": 664060160,
"step": 648500
},
{
"epoch": 5.988134451610522,
"grad_norm": 0.9623832106590271,
"learning_rate": 2.005937387549478e-05,
"loss": 0.0258,
"num_input_tokens_seen": 664572160,
"step": 649000
},
{
"epoch": 5.992747806349821,
"grad_norm": 1.2604849338531494,
"learning_rate": 2.003630710179829e-05,
"loss": 0.0257,
"num_input_tokens_seen": 665084160,
"step": 649500
},
{
"epoch": 5.99736116108912,
"grad_norm": 0.5637773871421814,
"learning_rate": 2.001324032810179e-05,
"loss": 0.0276,
"num_input_tokens_seen": 665596160,
"step": 650000
},
{
"epoch": 6.0,
"eval_combined_score": 0.06719425867896905,
"eval_loss": 0.06719426065683365,
"eval_mse": 0.06719425670110447,
"eval_runtime": 46.0502,
"eval_samples_per_second": 2092.023,
"eval_steps_per_second": 261.519,
"num_input_tokens_seen": 665888256,
"step": 650286
},
{
"epoch": 6.00197451582842,
"grad_norm": 0.7754026055335999,
"learning_rate": 1.9990173554405293e-05,
"loss": 0.0243,
"num_input_tokens_seen": 666107392,
"step": 650500
},
{
"epoch": 6.00658787056772,
"grad_norm": 3.4056851863861084,
"learning_rate": 1.9967106780708797e-05,
"loss": 0.0198,
"num_input_tokens_seen": 666619392,
"step": 651000
},
{
"epoch": 6.011201225307019,
"grad_norm": 0.7338670492172241,
"learning_rate": 1.9944040007012298e-05,
"loss": 0.0194,
"num_input_tokens_seen": 667131392,
"step": 651500
},
{
"epoch": 6.015814580046318,
"grad_norm": 0.9775220155715942,
"learning_rate": 1.9920973233315805e-05,
"loss": 0.0218,
"num_input_tokens_seen": 667643392,
"step": 652000
},
{
"epoch": 6.0204279347856176,
"grad_norm": 0.6513090133666992,
"learning_rate": 1.9897906459619306e-05,
"loss": 0.0225,
"num_input_tokens_seen": 668155392,
"step": 652500
},
{
"epoch": 6.025041289524917,
"grad_norm": 1.0997514724731445,
"learning_rate": 1.987483968592281e-05,
"loss": 0.0218,
"num_input_tokens_seen": 668667392,
"step": 653000
},
{
"epoch": 6.029654644264216,
"grad_norm": 1.8776363134384155,
"learning_rate": 1.9851772912226314e-05,
"loss": 0.0232,
"num_input_tokens_seen": 669179392,
"step": 653500
},
{
"epoch": 6.034267999003515,
"grad_norm": 1.0117559432983398,
"learning_rate": 1.9828706138529818e-05,
"loss": 0.0195,
"num_input_tokens_seen": 669691392,
"step": 654000
},
{
"epoch": 6.0388813537428145,
"grad_norm": 1.839374303817749,
"learning_rate": 1.980563936483332e-05,
"loss": 0.0206,
"num_input_tokens_seen": 670203392,
"step": 654500
},
{
"epoch": 6.043494708482114,
"grad_norm": 1.1383150815963745,
"learning_rate": 1.9782572591136826e-05,
"loss": 0.02,
"num_input_tokens_seen": 670715392,
"step": 655000
},
{
"epoch": 6.048108063221413,
"grad_norm": 0.6940335631370544,
"learning_rate": 1.9759505817440326e-05,
"loss": 0.0218,
"num_input_tokens_seen": 671227392,
"step": 655500
},
{
"epoch": 6.052721417960712,
"grad_norm": 0.9437240958213806,
"learning_rate": 1.973643904374383e-05,
"loss": 0.0214,
"num_input_tokens_seen": 671739392,
"step": 656000
},
{
"epoch": 6.057334772700012,
"grad_norm": 1.297887921333313,
"learning_rate": 1.9713372270047334e-05,
"loss": 0.0197,
"num_input_tokens_seen": 672251392,
"step": 656500
},
{
"epoch": 6.061948127439312,
"grad_norm": 1.1121424436569214,
"learning_rate": 1.9690305496350835e-05,
"loss": 0.0228,
"num_input_tokens_seen": 672763392,
"step": 657000
},
{
"epoch": 6.066561482178611,
"grad_norm": 1.2576148509979248,
"learning_rate": 1.9667238722654342e-05,
"loss": 0.0202,
"num_input_tokens_seen": 673275392,
"step": 657500
},
{
"epoch": 6.07117483691791,
"grad_norm": 0.9484318494796753,
"learning_rate": 1.9644171948957843e-05,
"loss": 0.0208,
"num_input_tokens_seen": 673787392,
"step": 658000
},
{
"epoch": 6.075788191657209,
"grad_norm": 1.5170820951461792,
"learning_rate": 1.9621105175261347e-05,
"loss": 0.0216,
"num_input_tokens_seen": 674299392,
"step": 658500
},
{
"epoch": 6.080401546396509,
"grad_norm": 1.5162551403045654,
"learning_rate": 1.959803840156485e-05,
"loss": 0.0209,
"num_input_tokens_seen": 674811392,
"step": 659000
},
{
"epoch": 6.085014901135808,
"grad_norm": 1.1097129583358765,
"learning_rate": 1.9574971627868355e-05,
"loss": 0.0211,
"num_input_tokens_seen": 675323392,
"step": 659500
},
{
"epoch": 6.089628255875107,
"grad_norm": 1.9856687784194946,
"learning_rate": 1.9551904854171855e-05,
"loss": 0.0205,
"num_input_tokens_seen": 675835392,
"step": 660000
},
{
"epoch": 6.094241610614406,
"grad_norm": 0.447665810585022,
"learning_rate": 1.9528838080475363e-05,
"loss": 0.0215,
"num_input_tokens_seen": 676347392,
"step": 660500
},
{
"epoch": 6.098854965353706,
"grad_norm": 0.6140983700752258,
"learning_rate": 1.9505771306778863e-05,
"loss": 0.0204,
"num_input_tokens_seen": 676859392,
"step": 661000
},
{
"epoch": 6.103468320093005,
"grad_norm": 0.6753659844398499,
"learning_rate": 1.9482704533082367e-05,
"loss": 0.0234,
"num_input_tokens_seen": 677371392,
"step": 661500
},
{
"epoch": 6.108081674832304,
"grad_norm": 0.5752419233322144,
"learning_rate": 1.945963775938587e-05,
"loss": 0.0202,
"num_input_tokens_seen": 677883392,
"step": 662000
},
{
"epoch": 6.112695029571604,
"grad_norm": 0.8498187065124512,
"learning_rate": 1.9436570985689375e-05,
"loss": 0.0207,
"num_input_tokens_seen": 678395392,
"step": 662500
},
{
"epoch": 6.1173083843109035,
"grad_norm": 0.8756592273712158,
"learning_rate": 1.941350421199288e-05,
"loss": 0.0193,
"num_input_tokens_seen": 678907392,
"step": 663000
},
{
"epoch": 6.121921739050203,
"grad_norm": 2.693408250808716,
"learning_rate": 1.939043743829638e-05,
"loss": 0.0192,
"num_input_tokens_seen": 679419392,
"step": 663500
},
{
"epoch": 6.126535093789502,
"grad_norm": 1.2562410831451416,
"learning_rate": 1.9367370664599884e-05,
"loss": 0.0214,
"num_input_tokens_seen": 679931392,
"step": 664000
},
{
"epoch": 6.131148448528801,
"grad_norm": 1.662607192993164,
"learning_rate": 1.9344303890903388e-05,
"loss": 0.0202,
"num_input_tokens_seen": 680443392,
"step": 664500
},
{
"epoch": 6.1357618032681005,
"grad_norm": 0.8095691800117493,
"learning_rate": 1.932123711720689e-05,
"loss": 0.0212,
"num_input_tokens_seen": 680955392,
"step": 665000
},
{
"epoch": 6.1403751580074,
"grad_norm": 0.5978444218635559,
"learning_rate": 1.9298170343510392e-05,
"loss": 0.0211,
"num_input_tokens_seen": 681467392,
"step": 665500
},
{
"epoch": 6.144988512746699,
"grad_norm": 0.5060915946960449,
"learning_rate": 1.92751035698139e-05,
"loss": 0.0227,
"num_input_tokens_seen": 681979392,
"step": 666000
},
{
"epoch": 6.149601867485998,
"grad_norm": 0.9484182596206665,
"learning_rate": 1.92520367961174e-05,
"loss": 0.0203,
"num_input_tokens_seen": 682491392,
"step": 666500
},
{
"epoch": 6.1542152222252975,
"grad_norm": 1.3608324527740479,
"learning_rate": 1.9228970022420904e-05,
"loss": 0.0222,
"num_input_tokens_seen": 683003392,
"step": 667000
},
{
"epoch": 6.158828576964597,
"grad_norm": 0.9933167099952698,
"learning_rate": 1.9205903248724408e-05,
"loss": 0.0193,
"num_input_tokens_seen": 683515392,
"step": 667500
},
{
"epoch": 6.163441931703897,
"grad_norm": 1.8458038568496704,
"learning_rate": 1.9182836475027912e-05,
"loss": 0.0208,
"num_input_tokens_seen": 684027392,
"step": 668000
},
{
"epoch": 6.168055286443196,
"grad_norm": 0.9922088384628296,
"learning_rate": 1.9159769701331416e-05,
"loss": 0.0227,
"num_input_tokens_seen": 684539392,
"step": 668500
},
{
"epoch": 6.172668641182495,
"grad_norm": 0.7523616552352905,
"learning_rate": 1.913670292763492e-05,
"loss": 0.0214,
"num_input_tokens_seen": 685051392,
"step": 669000
},
{
"epoch": 6.177281995921795,
"grad_norm": 1.4571471214294434,
"learning_rate": 1.911363615393842e-05,
"loss": 0.0213,
"num_input_tokens_seen": 685563392,
"step": 669500
},
{
"epoch": 6.181895350661094,
"grad_norm": 1.6645666360855103,
"learning_rate": 1.9090569380241925e-05,
"loss": 0.0231,
"num_input_tokens_seen": 686075392,
"step": 670000
},
{
"epoch": 6.186508705400393,
"grad_norm": 0.5746430158615112,
"learning_rate": 1.906750260654543e-05,
"loss": 0.0214,
"num_input_tokens_seen": 686587392,
"step": 670500
},
{
"epoch": 6.191122060139692,
"grad_norm": 0.6545117497444153,
"learning_rate": 1.9044435832848933e-05,
"loss": 0.0203,
"num_input_tokens_seen": 687099392,
"step": 671000
},
{
"epoch": 6.195735414878992,
"grad_norm": 0.6282312273979187,
"learning_rate": 1.9021369059152436e-05,
"loss": 0.0222,
"num_input_tokens_seen": 687611392,
"step": 671500
},
{
"epoch": 6.200348769618291,
"grad_norm": 0.7718172073364258,
"learning_rate": 1.8998302285455937e-05,
"loss": 0.0211,
"num_input_tokens_seen": 688123392,
"step": 672000
},
{
"epoch": 6.20496212435759,
"grad_norm": 1.4277899265289307,
"learning_rate": 1.897523551175944e-05,
"loss": 0.0207,
"num_input_tokens_seen": 688635392,
"step": 672500
},
{
"epoch": 6.209575479096889,
"grad_norm": 0.5869673490524292,
"learning_rate": 1.8952168738062945e-05,
"loss": 0.0201,
"num_input_tokens_seen": 689147392,
"step": 673000
},
{
"epoch": 6.214188833836189,
"grad_norm": 0.7148327231407166,
"learning_rate": 1.892910196436645e-05,
"loss": 0.0217,
"num_input_tokens_seen": 689659392,
"step": 673500
},
{
"epoch": 6.218802188575489,
"grad_norm": 1.9917762279510498,
"learning_rate": 1.8906035190669953e-05,
"loss": 0.0201,
"num_input_tokens_seen": 690171392,
"step": 674000
},
{
"epoch": 6.223415543314788,
"grad_norm": 1.030920386314392,
"learning_rate": 1.8882968416973457e-05,
"loss": 0.0217,
"num_input_tokens_seen": 690683392,
"step": 674500
},
{
"epoch": 6.228028898054087,
"grad_norm": 0.6258344054222107,
"learning_rate": 1.8859901643276958e-05,
"loss": 0.0205,
"num_input_tokens_seen": 691195392,
"step": 675000
},
{
"epoch": 6.2326422527933865,
"grad_norm": 2.0319483280181885,
"learning_rate": 1.8836834869580465e-05,
"loss": 0.0208,
"num_input_tokens_seen": 691707392,
"step": 675500
},
{
"epoch": 6.237255607532686,
"grad_norm": 0.5357654094696045,
"learning_rate": 1.8813768095883965e-05,
"loss": 0.021,
"num_input_tokens_seen": 692219392,
"step": 676000
},
{
"epoch": 6.241868962271985,
"grad_norm": 2.2843759059906006,
"learning_rate": 1.879070132218747e-05,
"loss": 0.0224,
"num_input_tokens_seen": 692731392,
"step": 676500
},
{
"epoch": 6.246482317011284,
"grad_norm": 0.7464880347251892,
"learning_rate": 1.8767634548490973e-05,
"loss": 0.0204,
"num_input_tokens_seen": 693243392,
"step": 677000
},
{
"epoch": 6.2510956717505834,
"grad_norm": 1.1594797372817993,
"learning_rate": 1.8744567774794474e-05,
"loss": 0.0219,
"num_input_tokens_seen": 693755392,
"step": 677500
},
{
"epoch": 6.255709026489883,
"grad_norm": 2.049744129180908,
"learning_rate": 1.872150100109798e-05,
"loss": 0.0189,
"num_input_tokens_seen": 694267392,
"step": 678000
},
{
"epoch": 6.260322381229182,
"grad_norm": 2.227196216583252,
"learning_rate": 1.8698434227401482e-05,
"loss": 0.0215,
"num_input_tokens_seen": 694779392,
"step": 678500
},
{
"epoch": 6.264935735968481,
"grad_norm": 1.209151268005371,
"learning_rate": 1.8675367453704986e-05,
"loss": 0.0207,
"num_input_tokens_seen": 695291392,
"step": 679000
},
{
"epoch": 6.26954909070778,
"grad_norm": 0.6479954123497009,
"learning_rate": 1.865230068000849e-05,
"loss": 0.0229,
"num_input_tokens_seen": 695803392,
"step": 679500
},
{
"epoch": 6.274162445447081,
"grad_norm": 0.5225302577018738,
"learning_rate": 1.8629233906311994e-05,
"loss": 0.0207,
"num_input_tokens_seen": 696315392,
"step": 680000
},
{
"epoch": 6.27877580018638,
"grad_norm": 0.8142069578170776,
"learning_rate": 1.8606167132615494e-05,
"loss": 0.0242,
"num_input_tokens_seen": 696827392,
"step": 680500
},
{
"epoch": 6.283389154925679,
"grad_norm": 2.5518014430999756,
"learning_rate": 1.8583100358919002e-05,
"loss": 0.0221,
"num_input_tokens_seen": 697339392,
"step": 681000
},
{
"epoch": 6.288002509664978,
"grad_norm": 0.609211266040802,
"learning_rate": 1.8560033585222502e-05,
"loss": 0.0238,
"num_input_tokens_seen": 697851392,
"step": 681500
},
{
"epoch": 6.292615864404278,
"grad_norm": 0.6666821837425232,
"learning_rate": 1.8536966811526006e-05,
"loss": 0.0235,
"num_input_tokens_seen": 698363392,
"step": 682000
},
{
"epoch": 6.297229219143577,
"grad_norm": 2.551591396331787,
"learning_rate": 1.851390003782951e-05,
"loss": 0.0233,
"num_input_tokens_seen": 698875392,
"step": 682500
},
{
"epoch": 6.301842573882876,
"grad_norm": 1.171808123588562,
"learning_rate": 1.849083326413301e-05,
"loss": 0.0211,
"num_input_tokens_seen": 699387392,
"step": 683000
},
{
"epoch": 6.306455928622175,
"grad_norm": 1.9758840799331665,
"learning_rate": 1.8467766490436518e-05,
"loss": 0.0212,
"num_input_tokens_seen": 699899392,
"step": 683500
},
{
"epoch": 6.3110692833614745,
"grad_norm": 0.7469502091407776,
"learning_rate": 1.844469971674002e-05,
"loss": 0.0217,
"num_input_tokens_seen": 700411392,
"step": 684000
},
{
"epoch": 6.315682638100774,
"grad_norm": 0.9809781908988953,
"learning_rate": 1.8421632943043523e-05,
"loss": 0.0229,
"num_input_tokens_seen": 700923392,
"step": 684500
},
{
"epoch": 6.320295992840073,
"grad_norm": 0.9586873650550842,
"learning_rate": 1.8398566169347027e-05,
"loss": 0.021,
"num_input_tokens_seen": 701435392,
"step": 685000
},
{
"epoch": 6.324909347579373,
"grad_norm": 8.868587493896484,
"learning_rate": 1.837549939565053e-05,
"loss": 0.0225,
"num_input_tokens_seen": 701947392,
"step": 685500
},
{
"epoch": 6.329522702318672,
"grad_norm": 1.1265676021575928,
"learning_rate": 1.835243262195403e-05,
"loss": 0.0212,
"num_input_tokens_seen": 702459392,
"step": 686000
},
{
"epoch": 6.334136057057972,
"grad_norm": 1.0341181755065918,
"learning_rate": 1.832936584825754e-05,
"loss": 0.0232,
"num_input_tokens_seen": 702971392,
"step": 686500
},
{
"epoch": 6.338749411797271,
"grad_norm": 0.3800777196884155,
"learning_rate": 1.830629907456104e-05,
"loss": 0.0213,
"num_input_tokens_seen": 703483392,
"step": 687000
},
{
"epoch": 6.34336276653657,
"grad_norm": 0.7369467616081238,
"learning_rate": 1.8283232300864543e-05,
"loss": 0.0234,
"num_input_tokens_seen": 703995392,
"step": 687500
},
{
"epoch": 6.347976121275869,
"grad_norm": 1.0980653762817383,
"learning_rate": 1.8260165527168047e-05,
"loss": 0.02,
"num_input_tokens_seen": 704507392,
"step": 688000
},
{
"epoch": 6.352589476015169,
"grad_norm": 17.581872940063477,
"learning_rate": 1.823709875347155e-05,
"loss": 0.0233,
"num_input_tokens_seen": 705019392,
"step": 688500
},
{
"epoch": 6.357202830754468,
"grad_norm": 0.5301328301429749,
"learning_rate": 1.8214031979775055e-05,
"loss": 0.0226,
"num_input_tokens_seen": 705531392,
"step": 689000
},
{
"epoch": 6.361816185493767,
"grad_norm": 0.44786104559898376,
"learning_rate": 1.8190965206078556e-05,
"loss": 0.0216,
"num_input_tokens_seen": 706043392,
"step": 689500
},
{
"epoch": 6.366429540233066,
"grad_norm": 2.587684154510498,
"learning_rate": 1.816789843238206e-05,
"loss": 0.0232,
"num_input_tokens_seen": 706555392,
"step": 690000
},
{
"epoch": 6.371042894972366,
"grad_norm": 1.0485097169876099,
"learning_rate": 1.8144831658685564e-05,
"loss": 0.0198,
"num_input_tokens_seen": 707067392,
"step": 690500
},
{
"epoch": 6.375656249711666,
"grad_norm": 0.38697299361228943,
"learning_rate": 1.8121764884989068e-05,
"loss": 0.0224,
"num_input_tokens_seen": 707579392,
"step": 691000
},
{
"epoch": 6.380269604450965,
"grad_norm": 1.7703328132629395,
"learning_rate": 1.8098698111292568e-05,
"loss": 0.0205,
"num_input_tokens_seen": 708091392,
"step": 691500
},
{
"epoch": 6.384882959190264,
"grad_norm": 0.5361246466636658,
"learning_rate": 1.8075631337596076e-05,
"loss": 0.0217,
"num_input_tokens_seen": 708603392,
"step": 692000
},
{
"epoch": 6.3894963139295635,
"grad_norm": 0.7262565493583679,
"learning_rate": 1.8052564563899576e-05,
"loss": 0.0231,
"num_input_tokens_seen": 709115392,
"step": 692500
},
{
"epoch": 6.394109668668863,
"grad_norm": 0.5426166653633118,
"learning_rate": 1.802949779020308e-05,
"loss": 0.0229,
"num_input_tokens_seen": 709627392,
"step": 693000
},
{
"epoch": 6.398723023408162,
"grad_norm": 0.9370472431182861,
"learning_rate": 1.8006431016506584e-05,
"loss": 0.0231,
"num_input_tokens_seen": 710139392,
"step": 693500
},
{
"epoch": 6.403336378147461,
"grad_norm": 1.1743369102478027,
"learning_rate": 1.7983364242810088e-05,
"loss": 0.0221,
"num_input_tokens_seen": 710651392,
"step": 694000
},
{
"epoch": 6.4079497328867605,
"grad_norm": 1.1654258966445923,
"learning_rate": 1.7960297469113592e-05,
"loss": 0.0239,
"num_input_tokens_seen": 711163392,
"step": 694500
},
{
"epoch": 6.41256308762606,
"grad_norm": 0.9082449078559875,
"learning_rate": 1.7937230695417096e-05,
"loss": 0.0206,
"num_input_tokens_seen": 711675392,
"step": 695000
},
{
"epoch": 6.417176442365359,
"grad_norm": 0.7706845998764038,
"learning_rate": 1.7914163921720597e-05,
"loss": 0.0225,
"num_input_tokens_seen": 712187392,
"step": 695500
},
{
"epoch": 6.421789797104658,
"grad_norm": 0.8697851896286011,
"learning_rate": 1.78910971480241e-05,
"loss": 0.0196,
"num_input_tokens_seen": 712699392,
"step": 696000
},
{
"epoch": 6.4264031518439575,
"grad_norm": 0.8328973054885864,
"learning_rate": 1.7868030374327605e-05,
"loss": 0.0191,
"num_input_tokens_seen": 713211392,
"step": 696500
},
{
"epoch": 6.431016506583257,
"grad_norm": 7.328830242156982,
"learning_rate": 1.7844963600631105e-05,
"loss": 0.0219,
"num_input_tokens_seen": 713723392,
"step": 697000
},
{
"epoch": 6.435629861322557,
"grad_norm": 0.9811331629753113,
"learning_rate": 1.7821896826934612e-05,
"loss": 0.0225,
"num_input_tokens_seen": 714235392,
"step": 697500
},
{
"epoch": 6.440243216061856,
"grad_norm": 2.4249658584594727,
"learning_rate": 1.7798830053238113e-05,
"loss": 0.0199,
"num_input_tokens_seen": 714747392,
"step": 698000
},
{
"epoch": 6.444856570801155,
"grad_norm": 1.6844923496246338,
"learning_rate": 1.7775763279541617e-05,
"loss": 0.0214,
"num_input_tokens_seen": 715259392,
"step": 698500
},
{
"epoch": 6.449469925540455,
"grad_norm": 2.2441189289093018,
"learning_rate": 1.775269650584512e-05,
"loss": 0.0205,
"num_input_tokens_seen": 715771392,
"step": 699000
},
{
"epoch": 6.454083280279754,
"grad_norm": 0.4577130973339081,
"learning_rate": 1.7729629732148625e-05,
"loss": 0.022,
"num_input_tokens_seen": 716283392,
"step": 699500
},
{
"epoch": 6.458696635019053,
"grad_norm": 1.2576284408569336,
"learning_rate": 1.770656295845213e-05,
"loss": 0.021,
"num_input_tokens_seen": 716795392,
"step": 700000
},
{
"epoch": 6.463309989758352,
"grad_norm": 1.3181337118148804,
"learning_rate": 1.7683496184755633e-05,
"loss": 0.0228,
"num_input_tokens_seen": 717307392,
"step": 700500
},
{
"epoch": 6.467923344497652,
"grad_norm": 0.6435089707374573,
"learning_rate": 1.7660429411059133e-05,
"loss": 0.0213,
"num_input_tokens_seen": 717819392,
"step": 701000
},
{
"epoch": 6.472536699236951,
"grad_norm": 1.2723332643508911,
"learning_rate": 1.763736263736264e-05,
"loss": 0.0197,
"num_input_tokens_seen": 718331392,
"step": 701500
},
{
"epoch": 6.47715005397625,
"grad_norm": 5.60179328918457,
"learning_rate": 1.761429586366614e-05,
"loss": 0.0246,
"num_input_tokens_seen": 718843392,
"step": 702000
},
{
"epoch": 6.481763408715549,
"grad_norm": 1.1845461130142212,
"learning_rate": 1.7591229089969642e-05,
"loss": 0.0208,
"num_input_tokens_seen": 719355392,
"step": 702500
},
{
"epoch": 6.4863767634548495,
"grad_norm": 0.9325453042984009,
"learning_rate": 1.756816231627315e-05,
"loss": 0.0214,
"num_input_tokens_seen": 719867392,
"step": 703000
},
{
"epoch": 6.490990118194149,
"grad_norm": 1.919224500656128,
"learning_rate": 1.754509554257665e-05,
"loss": 0.0208,
"num_input_tokens_seen": 720379392,
"step": 703500
},
{
"epoch": 6.495603472933448,
"grad_norm": 0.8646382093429565,
"learning_rate": 1.7522028768880154e-05,
"loss": 0.0227,
"num_input_tokens_seen": 720891392,
"step": 704000
},
{
"epoch": 6.500216827672747,
"grad_norm": 0.6728546619415283,
"learning_rate": 1.7498961995183658e-05,
"loss": 0.0214,
"num_input_tokens_seen": 721403392,
"step": 704500
},
{
"epoch": 6.5048301824120465,
"grad_norm": 1.701745629310608,
"learning_rate": 1.7475895221487162e-05,
"loss": 0.0198,
"num_input_tokens_seen": 721915392,
"step": 705000
},
{
"epoch": 6.509443537151346,
"grad_norm": 1.382514476776123,
"learning_rate": 1.7452828447790666e-05,
"loss": 0.0229,
"num_input_tokens_seen": 722427392,
"step": 705500
},
{
"epoch": 6.514056891890645,
"grad_norm": 1.366165041923523,
"learning_rate": 1.742976167409417e-05,
"loss": 0.0244,
"num_input_tokens_seen": 722939392,
"step": 706000
},
{
"epoch": 6.518670246629944,
"grad_norm": 0.727484405040741,
"learning_rate": 1.740669490039767e-05,
"loss": 0.0215,
"num_input_tokens_seen": 723451392,
"step": 706500
},
{
"epoch": 6.5232836013692435,
"grad_norm": 0.9992395043373108,
"learning_rate": 1.7383628126701178e-05,
"loss": 0.0203,
"num_input_tokens_seen": 723963392,
"step": 707000
},
{
"epoch": 6.527896956108543,
"grad_norm": 1.4681673049926758,
"learning_rate": 1.736056135300468e-05,
"loss": 0.0236,
"num_input_tokens_seen": 724475392,
"step": 707500
},
{
"epoch": 6.532510310847842,
"grad_norm": 0.6639313101768494,
"learning_rate": 1.7337494579308182e-05,
"loss": 0.0196,
"num_input_tokens_seen": 724987392,
"step": 708000
},
{
"epoch": 6.537123665587142,
"grad_norm": 1.4685230255126953,
"learning_rate": 1.7314427805611686e-05,
"loss": 0.0231,
"num_input_tokens_seen": 725499392,
"step": 708500
},
{
"epoch": 6.541737020326441,
"grad_norm": 0.711995542049408,
"learning_rate": 1.729136103191519e-05,
"loss": 0.0217,
"num_input_tokens_seen": 726011392,
"step": 709000
},
{
"epoch": 6.546350375065741,
"grad_norm": 0.849071204662323,
"learning_rate": 1.726829425821869e-05,
"loss": 0.0219,
"num_input_tokens_seen": 726523392,
"step": 709500
},
{
"epoch": 6.55096372980504,
"grad_norm": 0.7562097311019897,
"learning_rate": 1.7245227484522195e-05,
"loss": 0.0203,
"num_input_tokens_seen": 727035392,
"step": 710000
},
{
"epoch": 6.555577084544339,
"grad_norm": 1.556663155555725,
"learning_rate": 1.72221607108257e-05,
"loss": 0.0224,
"num_input_tokens_seen": 727547392,
"step": 710500
},
{
"epoch": 6.560190439283638,
"grad_norm": 3.2554850578308105,
"learning_rate": 1.7199093937129203e-05,
"loss": 0.022,
"num_input_tokens_seen": 728059392,
"step": 711000
},
{
"epoch": 6.564803794022938,
"grad_norm": 1.4903610944747925,
"learning_rate": 1.7176027163432707e-05,
"loss": 0.0204,
"num_input_tokens_seen": 728571392,
"step": 711500
},
{
"epoch": 6.569417148762237,
"grad_norm": 1.828810691833496,
"learning_rate": 1.7152960389736207e-05,
"loss": 0.0233,
"num_input_tokens_seen": 729083392,
"step": 712000
},
{
"epoch": 6.574030503501536,
"grad_norm": 0.5452165603637695,
"learning_rate": 1.7129893616039715e-05,
"loss": 0.0216,
"num_input_tokens_seen": 729595392,
"step": 712500
},
{
"epoch": 6.578643858240835,
"grad_norm": 1.4269682168960571,
"learning_rate": 1.7106826842343215e-05,
"loss": 0.0222,
"num_input_tokens_seen": 730107392,
"step": 713000
},
{
"epoch": 6.5832572129801346,
"grad_norm": 0.5227313041687012,
"learning_rate": 1.708376006864672e-05,
"loss": 0.0201,
"num_input_tokens_seen": 730619392,
"step": 713500
},
{
"epoch": 6.587870567719435,
"grad_norm": 0.8635200262069702,
"learning_rate": 1.7060693294950223e-05,
"loss": 0.0208,
"num_input_tokens_seen": 731131392,
"step": 714000
},
{
"epoch": 6.592483922458733,
"grad_norm": 1.070576548576355,
"learning_rate": 1.7037626521253727e-05,
"loss": 0.0213,
"num_input_tokens_seen": 731643392,
"step": 714500
},
{
"epoch": 6.597097277198033,
"grad_norm": 21.42013931274414,
"learning_rate": 1.7014559747557228e-05,
"loss": 0.0214,
"num_input_tokens_seen": 732155392,
"step": 715000
},
{
"epoch": 6.601710631937332,
"grad_norm": 1.3582208156585693,
"learning_rate": 1.6991492973860735e-05,
"loss": 0.0222,
"num_input_tokens_seen": 732667392,
"step": 715500
},
{
"epoch": 6.606323986676632,
"grad_norm": 1.3939865827560425,
"learning_rate": 1.6968426200164236e-05,
"loss": 0.0212,
"num_input_tokens_seen": 733179392,
"step": 716000
},
{
"epoch": 6.610937341415931,
"grad_norm": 1.0751606225967407,
"learning_rate": 1.694535942646774e-05,
"loss": 0.0225,
"num_input_tokens_seen": 733691392,
"step": 716500
},
{
"epoch": 6.61555069615523,
"grad_norm": 1.630864143371582,
"learning_rate": 1.6922292652771244e-05,
"loss": 0.0215,
"num_input_tokens_seen": 734203392,
"step": 717000
},
{
"epoch": 6.620164050894529,
"grad_norm": 0.7903428077697754,
"learning_rate": 1.6899225879074744e-05,
"loss": 0.0204,
"num_input_tokens_seen": 734715392,
"step": 717500
},
{
"epoch": 6.624777405633829,
"grad_norm": 0.9173442125320435,
"learning_rate": 1.687615910537825e-05,
"loss": 0.0205,
"num_input_tokens_seen": 735227392,
"step": 718000
},
{
"epoch": 6.629390760373128,
"grad_norm": 0.4864923059940338,
"learning_rate": 1.6853092331681752e-05,
"loss": 0.0222,
"num_input_tokens_seen": 735739392,
"step": 718500
},
{
"epoch": 6.634004115112427,
"grad_norm": 2.9184951782226562,
"learning_rate": 1.6830025557985256e-05,
"loss": 0.0213,
"num_input_tokens_seen": 736251392,
"step": 719000
},
{
"epoch": 6.638617469851726,
"grad_norm": 0.9503863453865051,
"learning_rate": 1.680695878428876e-05,
"loss": 0.0213,
"num_input_tokens_seen": 736763392,
"step": 719500
},
{
"epoch": 6.643230824591026,
"grad_norm": 1.129035234451294,
"learning_rate": 1.6783892010592264e-05,
"loss": 0.0217,
"num_input_tokens_seen": 737275392,
"step": 720000
},
{
"epoch": 6.647844179330326,
"grad_norm": 0.7650052309036255,
"learning_rate": 1.6760825236895768e-05,
"loss": 0.0209,
"num_input_tokens_seen": 737787392,
"step": 720500
},
{
"epoch": 6.652457534069625,
"grad_norm": 1.070244312286377,
"learning_rate": 1.6737758463199272e-05,
"loss": 0.0201,
"num_input_tokens_seen": 738299392,
"step": 721000
},
{
"epoch": 6.657070888808924,
"grad_norm": 1.1811015605926514,
"learning_rate": 1.6714691689502773e-05,
"loss": 0.0233,
"num_input_tokens_seen": 738811392,
"step": 721500
},
{
"epoch": 6.6616842435482235,
"grad_norm": 1.0393638610839844,
"learning_rate": 1.6691624915806277e-05,
"loss": 0.0227,
"num_input_tokens_seen": 739323392,
"step": 722000
},
{
"epoch": 6.666297598287523,
"grad_norm": 1.2030943632125854,
"learning_rate": 1.666855814210978e-05,
"loss": 0.0234,
"num_input_tokens_seen": 739835392,
"step": 722500
},
{
"epoch": 6.670910953026822,
"grad_norm": 0.676896870136261,
"learning_rate": 1.664549136841328e-05,
"loss": 0.0214,
"num_input_tokens_seen": 740347392,
"step": 723000
},
{
"epoch": 6.675524307766121,
"grad_norm": 0.9208011031150818,
"learning_rate": 1.662242459471679e-05,
"loss": 0.0235,
"num_input_tokens_seen": 740859392,
"step": 723500
},
{
"epoch": 6.6801376625054205,
"grad_norm": 0.5715643763542175,
"learning_rate": 1.659935782102029e-05,
"loss": 0.0192,
"num_input_tokens_seen": 741371392,
"step": 724000
},
{
"epoch": 6.68475101724472,
"grad_norm": 3.038097381591797,
"learning_rate": 1.6576291047323793e-05,
"loss": 0.0227,
"num_input_tokens_seen": 741883392,
"step": 724500
},
{
"epoch": 6.689364371984019,
"grad_norm": 0.7479985952377319,
"learning_rate": 1.6553224273627297e-05,
"loss": 0.022,
"num_input_tokens_seen": 742395392,
"step": 725000
},
{
"epoch": 6.693977726723318,
"grad_norm": 0.4049575626850128,
"learning_rate": 1.65301574999308e-05,
"loss": 0.0225,
"num_input_tokens_seen": 742907392,
"step": 725500
},
{
"epoch": 6.698591081462618,
"grad_norm": 1.12605881690979,
"learning_rate": 1.6507090726234305e-05,
"loss": 0.0227,
"num_input_tokens_seen": 743419392,
"step": 726000
},
{
"epoch": 6.703204436201918,
"grad_norm": 0.9142519235610962,
"learning_rate": 1.648402395253781e-05,
"loss": 0.0225,
"num_input_tokens_seen": 743931392,
"step": 726500
},
{
"epoch": 6.707817790941217,
"grad_norm": 2.4688339233398438,
"learning_rate": 1.646095717884131e-05,
"loss": 0.0231,
"num_input_tokens_seen": 744443392,
"step": 727000
},
{
"epoch": 6.712431145680516,
"grad_norm": 0.49617233872413635,
"learning_rate": 1.6437890405144817e-05,
"loss": 0.0215,
"num_input_tokens_seen": 744955392,
"step": 727500
},
{
"epoch": 6.717044500419815,
"grad_norm": 2.4510884284973145,
"learning_rate": 1.6414823631448317e-05,
"loss": 0.022,
"num_input_tokens_seen": 745467392,
"step": 728000
},
{
"epoch": 6.721657855159115,
"grad_norm": 0.6233497262001038,
"learning_rate": 1.639175685775182e-05,
"loss": 0.0209,
"num_input_tokens_seen": 745979392,
"step": 728500
},
{
"epoch": 6.726271209898414,
"grad_norm": 1.1352206468582153,
"learning_rate": 1.6368690084055325e-05,
"loss": 0.0194,
"num_input_tokens_seen": 746491392,
"step": 729000
},
{
"epoch": 6.730884564637713,
"grad_norm": 0.4292503297328949,
"learning_rate": 1.6345623310358826e-05,
"loss": 0.0213,
"num_input_tokens_seen": 747003392,
"step": 729500
},
{
"epoch": 6.735497919377012,
"grad_norm": 0.7327638864517212,
"learning_rate": 1.632255653666233e-05,
"loss": 0.0216,
"num_input_tokens_seen": 747515392,
"step": 730000
},
{
"epoch": 6.740111274116312,
"grad_norm": 1.2657952308654785,
"learning_rate": 1.6299489762965834e-05,
"loss": 0.021,
"num_input_tokens_seen": 748027392,
"step": 730500
},
{
"epoch": 6.744724628855611,
"grad_norm": 2.1072635650634766,
"learning_rate": 1.6276422989269338e-05,
"loss": 0.0205,
"num_input_tokens_seen": 748539392,
"step": 731000
},
{
"epoch": 6.749337983594911,
"grad_norm": 0.5420140027999878,
"learning_rate": 1.6253356215572842e-05,
"loss": 0.0217,
"num_input_tokens_seen": 749051392,
"step": 731500
},
{
"epoch": 6.75395133833421,
"grad_norm": 0.9647169709205627,
"learning_rate": 1.6230289441876346e-05,
"loss": 0.023,
"num_input_tokens_seen": 749563392,
"step": 732000
},
{
"epoch": 6.7585646930735095,
"grad_norm": 0.5795858502388,
"learning_rate": 1.6207222668179846e-05,
"loss": 0.0207,
"num_input_tokens_seen": 750075392,
"step": 732500
},
{
"epoch": 6.763178047812809,
"grad_norm": 0.776720404624939,
"learning_rate": 1.6184155894483354e-05,
"loss": 0.0239,
"num_input_tokens_seen": 750587392,
"step": 733000
},
{
"epoch": 6.767791402552108,
"grad_norm": 3.4119088649749756,
"learning_rate": 1.6161089120786854e-05,
"loss": 0.0234,
"num_input_tokens_seen": 751099392,
"step": 733500
},
{
"epoch": 6.772404757291407,
"grad_norm": 0.5689214468002319,
"learning_rate": 1.6138022347090358e-05,
"loss": 0.021,
"num_input_tokens_seen": 751611392,
"step": 734000
},
{
"epoch": 6.7770181120307065,
"grad_norm": 0.6440141201019287,
"learning_rate": 1.6114955573393862e-05,
"loss": 0.0228,
"num_input_tokens_seen": 752123392,
"step": 734500
},
{
"epoch": 6.781631466770006,
"grad_norm": 0.5016751289367676,
"learning_rate": 1.6091888799697366e-05,
"loss": 0.023,
"num_input_tokens_seen": 752635392,
"step": 735000
},
{
"epoch": 6.786244821509305,
"grad_norm": 0.6144362092018127,
"learning_rate": 1.6068822026000867e-05,
"loss": 0.0227,
"num_input_tokens_seen": 753147392,
"step": 735500
},
{
"epoch": 6.790858176248604,
"grad_norm": 0.356981486082077,
"learning_rate": 1.604575525230437e-05,
"loss": 0.0204,
"num_input_tokens_seen": 753659392,
"step": 736000
},
{
"epoch": 6.7954715309879035,
"grad_norm": 0.6662021279335022,
"learning_rate": 1.6022688478607875e-05,
"loss": 0.0222,
"num_input_tokens_seen": 754171392,
"step": 736500
},
{
"epoch": 6.800084885727204,
"grad_norm": 1.0647578239440918,
"learning_rate": 1.599962170491138e-05,
"loss": 0.0207,
"num_input_tokens_seen": 754683392,
"step": 737000
},
{
"epoch": 6.804698240466502,
"grad_norm": 0.8494476675987244,
"learning_rate": 1.5976554931214883e-05,
"loss": 0.0208,
"num_input_tokens_seen": 755195392,
"step": 737500
},
{
"epoch": 6.809311595205802,
"grad_norm": 1.5736192464828491,
"learning_rate": 1.5953488157518383e-05,
"loss": 0.0245,
"num_input_tokens_seen": 755707392,
"step": 738000
},
{
"epoch": 6.813924949945101,
"grad_norm": 1.5811710357666016,
"learning_rate": 1.593042138382189e-05,
"loss": 0.0198,
"num_input_tokens_seen": 756219392,
"step": 738500
},
{
"epoch": 6.818538304684401,
"grad_norm": 0.7430917024612427,
"learning_rate": 1.590735461012539e-05,
"loss": 0.0238,
"num_input_tokens_seen": 756731392,
"step": 739000
},
{
"epoch": 6.8231516594237,
"grad_norm": 0.346450537443161,
"learning_rate": 1.5884287836428895e-05,
"loss": 0.0207,
"num_input_tokens_seen": 757243392,
"step": 739500
},
{
"epoch": 6.827765014162999,
"grad_norm": 5.301863670349121,
"learning_rate": 1.58612210627324e-05,
"loss": 0.0228,
"num_input_tokens_seen": 757755392,
"step": 740000
},
{
"epoch": 6.832378368902298,
"grad_norm": 0.9501894116401672,
"learning_rate": 1.5838154289035903e-05,
"loss": 0.0217,
"num_input_tokens_seen": 758267392,
"step": 740500
},
{
"epoch": 6.836991723641598,
"grad_norm": 0.4030236601829529,
"learning_rate": 1.5815087515339404e-05,
"loss": 0.0233,
"num_input_tokens_seen": 758779392,
"step": 741000
},
{
"epoch": 6.841605078380897,
"grad_norm": 3.976102352142334,
"learning_rate": 1.579202074164291e-05,
"loss": 0.0229,
"num_input_tokens_seen": 759291392,
"step": 741500
},
{
"epoch": 6.846218433120196,
"grad_norm": 1.0763275623321533,
"learning_rate": 1.576895396794641e-05,
"loss": 0.0247,
"num_input_tokens_seen": 759803392,
"step": 742000
},
{
"epoch": 6.850831787859495,
"grad_norm": 1.278295636177063,
"learning_rate": 1.5745887194249916e-05,
"loss": 0.0207,
"num_input_tokens_seen": 760315392,
"step": 742500
},
{
"epoch": 6.855445142598795,
"grad_norm": 1.3523164987564087,
"learning_rate": 1.572282042055342e-05,
"loss": 0.0215,
"num_input_tokens_seen": 760827392,
"step": 743000
},
{
"epoch": 6.860058497338095,
"grad_norm": 2.487576484680176,
"learning_rate": 1.569975364685692e-05,
"loss": 0.0197,
"num_input_tokens_seen": 761339392,
"step": 743500
},
{
"epoch": 6.864671852077394,
"grad_norm": 0.43189629912376404,
"learning_rate": 1.5676686873160428e-05,
"loss": 0.0209,
"num_input_tokens_seen": 761851392,
"step": 744000
},
{
"epoch": 6.869285206816693,
"grad_norm": 1.3960847854614258,
"learning_rate": 1.5653620099463928e-05,
"loss": 0.0211,
"num_input_tokens_seen": 762363392,
"step": 744500
},
{
"epoch": 6.873898561555992,
"grad_norm": 0.642167866230011,
"learning_rate": 1.5630553325767432e-05,
"loss": 0.0228,
"num_input_tokens_seen": 762875392,
"step": 745000
},
{
"epoch": 6.878511916295292,
"grad_norm": 0.7163909673690796,
"learning_rate": 1.5607486552070936e-05,
"loss": 0.0225,
"num_input_tokens_seen": 763387392,
"step": 745500
},
{
"epoch": 6.883125271034591,
"grad_norm": 0.8028944134712219,
"learning_rate": 1.558441977837444e-05,
"loss": 0.0217,
"num_input_tokens_seen": 763899392,
"step": 746000
},
{
"epoch": 6.88773862577389,
"grad_norm": 0.8963446617126465,
"learning_rate": 1.556135300467794e-05,
"loss": 0.0233,
"num_input_tokens_seen": 764411392,
"step": 746500
},
{
"epoch": 6.892351980513189,
"grad_norm": 1.2736632823944092,
"learning_rate": 1.5538286230981448e-05,
"loss": 0.0198,
"num_input_tokens_seen": 764923392,
"step": 747000
},
{
"epoch": 6.896965335252489,
"grad_norm": 1.9002121686935425,
"learning_rate": 1.551521945728495e-05,
"loss": 0.0204,
"num_input_tokens_seen": 765435392,
"step": 747500
},
{
"epoch": 6.901578689991788,
"grad_norm": 1.7518917322158813,
"learning_rate": 1.5492152683588453e-05,
"loss": 0.0212,
"num_input_tokens_seen": 765947392,
"step": 748000
},
{
"epoch": 6.906192044731087,
"grad_norm": 0.5055529475212097,
"learning_rate": 1.5469085909891956e-05,
"loss": 0.0223,
"num_input_tokens_seen": 766459392,
"step": 748500
},
{
"epoch": 6.910805399470387,
"grad_norm": 1.280887246131897,
"learning_rate": 1.5446019136195457e-05,
"loss": 0.0215,
"num_input_tokens_seen": 766971392,
"step": 749000
},
{
"epoch": 6.9154187542096865,
"grad_norm": 1.3082467317581177,
"learning_rate": 1.5422952362498964e-05,
"loss": 0.0239,
"num_input_tokens_seen": 767483392,
"step": 749500
},
{
"epoch": 6.920032108948986,
"grad_norm": 0.4849281907081604,
"learning_rate": 1.5399885588802465e-05,
"loss": 0.0219,
"num_input_tokens_seen": 767995392,
"step": 750000
},
{
"epoch": 6.924645463688285,
"grad_norm": 1.54342520236969,
"learning_rate": 1.537681881510597e-05,
"loss": 0.0212,
"num_input_tokens_seen": 768507392,
"step": 750500
},
{
"epoch": 6.929258818427584,
"grad_norm": 1.441550850868225,
"learning_rate": 1.5353752041409473e-05,
"loss": 0.0232,
"num_input_tokens_seen": 769019392,
"step": 751000
},
{
"epoch": 6.9338721731668835,
"grad_norm": 1.3304697275161743,
"learning_rate": 1.5330685267712977e-05,
"loss": 0.0229,
"num_input_tokens_seen": 769531392,
"step": 751500
},
{
"epoch": 6.938485527906183,
"grad_norm": 1.3655359745025635,
"learning_rate": 1.5307618494016477e-05,
"loss": 0.0232,
"num_input_tokens_seen": 770043392,
"step": 752000
},
{
"epoch": 6.943098882645482,
"grad_norm": 1.3380628824234009,
"learning_rate": 1.5284551720319985e-05,
"loss": 0.0208,
"num_input_tokens_seen": 770555392,
"step": 752500
},
{
"epoch": 6.947712237384781,
"grad_norm": 0.7669854164123535,
"learning_rate": 1.5261484946623485e-05,
"loss": 0.0219,
"num_input_tokens_seen": 771067392,
"step": 753000
},
{
"epoch": 6.9523255921240805,
"grad_norm": 0.653236985206604,
"learning_rate": 1.5238418172926991e-05,
"loss": 0.0197,
"num_input_tokens_seen": 771579392,
"step": 753500
},
{
"epoch": 6.95693894686338,
"grad_norm": 0.7252629995346069,
"learning_rate": 1.5215351399230493e-05,
"loss": 0.0231,
"num_input_tokens_seen": 772091392,
"step": 754000
},
{
"epoch": 6.96155230160268,
"grad_norm": 0.7869466543197632,
"learning_rate": 1.5192284625533997e-05,
"loss": 0.0219,
"num_input_tokens_seen": 772603392,
"step": 754500
},
{
"epoch": 6.966165656341978,
"grad_norm": 1.048891544342041,
"learning_rate": 1.51692178518375e-05,
"loss": 0.0246,
"num_input_tokens_seen": 773115392,
"step": 755000
},
{
"epoch": 6.970779011081278,
"grad_norm": 0.7492154836654663,
"learning_rate": 1.5146151078141002e-05,
"loss": 0.0207,
"num_input_tokens_seen": 773627392,
"step": 755500
},
{
"epoch": 6.975392365820578,
"grad_norm": 1.5296510457992554,
"learning_rate": 1.5123084304444508e-05,
"loss": 0.023,
"num_input_tokens_seen": 774139392,
"step": 756000
},
{
"epoch": 6.980005720559877,
"grad_norm": 0.6391850113868713,
"learning_rate": 1.510001753074801e-05,
"loss": 0.0211,
"num_input_tokens_seen": 774651392,
"step": 756500
},
{
"epoch": 6.984619075299176,
"grad_norm": 1.2069010734558105,
"learning_rate": 1.5076950757051514e-05,
"loss": 0.0207,
"num_input_tokens_seen": 775163392,
"step": 757000
},
{
"epoch": 6.989232430038475,
"grad_norm": 2.368687629699707,
"learning_rate": 1.5053883983355016e-05,
"loss": 0.024,
"num_input_tokens_seen": 775675392,
"step": 757500
},
{
"epoch": 6.993845784777775,
"grad_norm": 1.284287452697754,
"learning_rate": 1.5030817209658522e-05,
"loss": 0.0198,
"num_input_tokens_seen": 776187392,
"step": 758000
},
{
"epoch": 6.998459139517074,
"grad_norm": 5.402317523956299,
"learning_rate": 1.5007750435962022e-05,
"loss": 0.025,
"num_input_tokens_seen": 776699392,
"step": 758500
},
{
"epoch": 7.0,
"eval_combined_score": 0.06412914552577642,
"eval_loss": 0.06412914395332336,
"eval_mse": 0.06412914709822949,
"eval_runtime": 47.0336,
"eval_samples_per_second": 2048.28,
"eval_steps_per_second": 256.051,
"num_input_tokens_seen": 776869632,
"step": 758667
},
{
"epoch": 7.003072494256373,
"grad_norm": 1.346767783164978,
"learning_rate": 1.4984683662265528e-05,
"loss": 0.018,
"num_input_tokens_seen": 777210624,
"step": 759000
},
{
"epoch": 7.007685848995672,
"grad_norm": 0.9796298146247864,
"learning_rate": 1.496161688856903e-05,
"loss": 0.0184,
"num_input_tokens_seen": 777722624,
"step": 759500
},
{
"epoch": 7.012299203734972,
"grad_norm": 1.2551716566085815,
"learning_rate": 1.4938550114872534e-05,
"loss": 0.0173,
"num_input_tokens_seen": 778234624,
"step": 760000
},
{
"epoch": 7.016912558474272,
"grad_norm": 0.8987337946891785,
"learning_rate": 1.4915483341176037e-05,
"loss": 0.0177,
"num_input_tokens_seen": 778746624,
"step": 760500
},
{
"epoch": 7.021525913213571,
"grad_norm": 0.38303157687187195,
"learning_rate": 1.4892416567479542e-05,
"loss": 0.0179,
"num_input_tokens_seen": 779258624,
"step": 761000
},
{
"epoch": 7.02613926795287,
"grad_norm": 1.3380213975906372,
"learning_rate": 1.4869349793783044e-05,
"loss": 0.0183,
"num_input_tokens_seen": 779770624,
"step": 761500
},
{
"epoch": 7.0307526226921695,
"grad_norm": 2.466179609298706,
"learning_rate": 1.4846283020086547e-05,
"loss": 0.0175,
"num_input_tokens_seen": 780282624,
"step": 762000
},
{
"epoch": 7.035365977431469,
"grad_norm": 0.4640190303325653,
"learning_rate": 1.482321624639005e-05,
"loss": 0.0168,
"num_input_tokens_seen": 780794624,
"step": 762500
},
{
"epoch": 7.039979332170768,
"grad_norm": 0.6390454173088074,
"learning_rate": 1.4800149472693553e-05,
"loss": 0.0165,
"num_input_tokens_seen": 781306624,
"step": 763000
},
{
"epoch": 7.044592686910067,
"grad_norm": 0.9119462966918945,
"learning_rate": 1.4777082698997059e-05,
"loss": 0.0162,
"num_input_tokens_seen": 781818624,
"step": 763500
},
{
"epoch": 7.0492060416493665,
"grad_norm": 1.088921070098877,
"learning_rate": 1.475401592530056e-05,
"loss": 0.0182,
"num_input_tokens_seen": 782330624,
"step": 764000
},
{
"epoch": 7.053819396388666,
"grad_norm": 0.5869113802909851,
"learning_rate": 1.4730949151604065e-05,
"loss": 0.0191,
"num_input_tokens_seen": 782842624,
"step": 764500
},
{
"epoch": 7.058432751127965,
"grad_norm": 1.6925584077835083,
"learning_rate": 1.4707882377907567e-05,
"loss": 0.0187,
"num_input_tokens_seen": 783354624,
"step": 765000
},
{
"epoch": 7.063046105867264,
"grad_norm": 1.0733281373977661,
"learning_rate": 1.4684815604211071e-05,
"loss": 0.0171,
"num_input_tokens_seen": 783866624,
"step": 765500
},
{
"epoch": 7.0676594606065635,
"grad_norm": 0.3278258442878723,
"learning_rate": 1.4661748830514573e-05,
"loss": 0.0183,
"num_input_tokens_seen": 784378624,
"step": 766000
},
{
"epoch": 7.072272815345864,
"grad_norm": 2.2622592449188232,
"learning_rate": 1.4638682056818079e-05,
"loss": 0.0171,
"num_input_tokens_seen": 784890624,
"step": 766500
},
{
"epoch": 7.076886170085163,
"grad_norm": 0.846518337726593,
"learning_rate": 1.4615615283121581e-05,
"loss": 0.0178,
"num_input_tokens_seen": 785402624,
"step": 767000
},
{
"epoch": 7.081499524824462,
"grad_norm": 0.9698590636253357,
"learning_rate": 1.4592548509425085e-05,
"loss": 0.0173,
"num_input_tokens_seen": 785914624,
"step": 767500
},
{
"epoch": 7.086112879563761,
"grad_norm": 0.5238065123558044,
"learning_rate": 1.4569481735728588e-05,
"loss": 0.0165,
"num_input_tokens_seen": 786426624,
"step": 768000
},
{
"epoch": 7.090726234303061,
"grad_norm": 0.7391173839569092,
"learning_rate": 1.454641496203209e-05,
"loss": 0.0193,
"num_input_tokens_seen": 786938624,
"step": 768500
},
{
"epoch": 7.09533958904236,
"grad_norm": 0.8646796941757202,
"learning_rate": 1.4523348188335596e-05,
"loss": 0.0182,
"num_input_tokens_seen": 787450624,
"step": 769000
},
{
"epoch": 7.099952943781659,
"grad_norm": 0.5301780700683594,
"learning_rate": 1.4500281414639096e-05,
"loss": 0.017,
"num_input_tokens_seen": 787962624,
"step": 769500
},
{
"epoch": 7.104566298520958,
"grad_norm": 2.3351125717163086,
"learning_rate": 1.4477214640942602e-05,
"loss": 0.0182,
"num_input_tokens_seen": 788474624,
"step": 770000
},
{
"epoch": 7.109179653260258,
"grad_norm": 0.59925377368927,
"learning_rate": 1.4454147867246104e-05,
"loss": 0.0164,
"num_input_tokens_seen": 788986624,
"step": 770500
},
{
"epoch": 7.113793007999557,
"grad_norm": 0.5372639298439026,
"learning_rate": 1.4431081093549608e-05,
"loss": 0.0181,
"num_input_tokens_seen": 789498624,
"step": 771000
},
{
"epoch": 7.118406362738856,
"grad_norm": 1.028199553489685,
"learning_rate": 1.440801431985311e-05,
"loss": 0.0179,
"num_input_tokens_seen": 790010624,
"step": 771500
},
{
"epoch": 7.123019717478156,
"grad_norm": 0.32566505670547485,
"learning_rate": 1.4384947546156616e-05,
"loss": 0.0177,
"num_input_tokens_seen": 790522624,
"step": 772000
},
{
"epoch": 7.1276330722174555,
"grad_norm": 1.434348702430725,
"learning_rate": 1.4361880772460118e-05,
"loss": 0.0195,
"num_input_tokens_seen": 791034624,
"step": 772500
},
{
"epoch": 7.132246426956755,
"grad_norm": 1.0634896755218506,
"learning_rate": 1.4338813998763622e-05,
"loss": 0.0172,
"num_input_tokens_seen": 791546624,
"step": 773000
},
{
"epoch": 7.136859781696054,
"grad_norm": 1.0522830486297607,
"learning_rate": 1.4315747225067125e-05,
"loss": 0.017,
"num_input_tokens_seen": 792058624,
"step": 773500
},
{
"epoch": 7.141473136435353,
"grad_norm": 1.2891104221343994,
"learning_rate": 1.429268045137063e-05,
"loss": 0.0173,
"num_input_tokens_seen": 792570624,
"step": 774000
},
{
"epoch": 7.1460864911746524,
"grad_norm": 0.5944826006889343,
"learning_rate": 1.4269613677674132e-05,
"loss": 0.0168,
"num_input_tokens_seen": 793082624,
"step": 774500
},
{
"epoch": 7.150699845913952,
"grad_norm": 1.0896071195602417,
"learning_rate": 1.4246546903977635e-05,
"loss": 0.0195,
"num_input_tokens_seen": 793594624,
"step": 775000
},
{
"epoch": 7.155313200653251,
"grad_norm": 0.5116850137710571,
"learning_rate": 1.4223480130281139e-05,
"loss": 0.0181,
"num_input_tokens_seen": 794106624,
"step": 775500
},
{
"epoch": 7.15992655539255,
"grad_norm": 0.6353034377098083,
"learning_rate": 1.4200413356584641e-05,
"loss": 0.015,
"num_input_tokens_seen": 794618624,
"step": 776000
},
{
"epoch": 7.164539910131849,
"grad_norm": 2.1156020164489746,
"learning_rate": 1.4177346582888145e-05,
"loss": 0.0181,
"num_input_tokens_seen": 795130624,
"step": 776500
},
{
"epoch": 7.169153264871149,
"grad_norm": 0.4953656494617462,
"learning_rate": 1.4154279809191647e-05,
"loss": 0.0174,
"num_input_tokens_seen": 795642624,
"step": 777000
},
{
"epoch": 7.173766619610448,
"grad_norm": 0.39725926518440247,
"learning_rate": 1.4131213035495153e-05,
"loss": 0.0178,
"num_input_tokens_seen": 796154624,
"step": 777500
},
{
"epoch": 7.178379974349748,
"grad_norm": 0.7973536849021912,
"learning_rate": 1.4108146261798655e-05,
"loss": 0.0192,
"num_input_tokens_seen": 796666624,
"step": 778000
},
{
"epoch": 7.182993329089047,
"grad_norm": 0.27644041180610657,
"learning_rate": 1.4085079488102159e-05,
"loss": 0.0187,
"num_input_tokens_seen": 797178624,
"step": 778500
},
{
"epoch": 7.1876066838283466,
"grad_norm": 0.5681914687156677,
"learning_rate": 1.4062012714405661e-05,
"loss": 0.0175,
"num_input_tokens_seen": 797690624,
"step": 779000
},
{
"epoch": 7.192220038567646,
"grad_norm": 0.19514349102973938,
"learning_rate": 1.4038945940709167e-05,
"loss": 0.0176,
"num_input_tokens_seen": 798202624,
"step": 779500
},
{
"epoch": 7.196833393306945,
"grad_norm": 1.4721050262451172,
"learning_rate": 1.401587916701267e-05,
"loss": 0.0188,
"num_input_tokens_seen": 798714624,
"step": 780000
},
{
"epoch": 7.201446748046244,
"grad_norm": 0.7421937584877014,
"learning_rate": 1.3992812393316173e-05,
"loss": 0.0188,
"num_input_tokens_seen": 799226624,
"step": 780500
},
{
"epoch": 7.2060601027855435,
"grad_norm": 0.12846527993679047,
"learning_rate": 1.3969745619619676e-05,
"loss": 0.018,
"num_input_tokens_seen": 799738624,
"step": 781000
},
{
"epoch": 7.210673457524843,
"grad_norm": 0.8358561992645264,
"learning_rate": 1.3946678845923178e-05,
"loss": 0.018,
"num_input_tokens_seen": 800250624,
"step": 781500
},
{
"epoch": 7.215286812264142,
"grad_norm": 1.0720690488815308,
"learning_rate": 1.3923612072226684e-05,
"loss": 0.0161,
"num_input_tokens_seen": 800762624,
"step": 782000
},
{
"epoch": 7.219900167003441,
"grad_norm": 0.4553976356983185,
"learning_rate": 1.3900545298530184e-05,
"loss": 0.0188,
"num_input_tokens_seen": 801274624,
"step": 782500
},
{
"epoch": 7.2245135217427405,
"grad_norm": 1.1510006189346313,
"learning_rate": 1.387747852483369e-05,
"loss": 0.018,
"num_input_tokens_seen": 801786624,
"step": 783000
},
{
"epoch": 7.22912687648204,
"grad_norm": 1.1483092308044434,
"learning_rate": 1.3854411751137192e-05,
"loss": 0.0171,
"num_input_tokens_seen": 802298624,
"step": 783500
},
{
"epoch": 7.23374023122134,
"grad_norm": 0.4925529658794403,
"learning_rate": 1.3831344977440696e-05,
"loss": 0.0173,
"num_input_tokens_seen": 802810624,
"step": 784000
},
{
"epoch": 7.238353585960639,
"grad_norm": 0.3787945508956909,
"learning_rate": 1.3808278203744198e-05,
"loss": 0.0188,
"num_input_tokens_seen": 803322624,
"step": 784500
},
{
"epoch": 7.242966940699938,
"grad_norm": 0.6160422563552856,
"learning_rate": 1.3785211430047704e-05,
"loss": 0.018,
"num_input_tokens_seen": 803834624,
"step": 785000
},
{
"epoch": 7.247580295439238,
"grad_norm": 1.1294529438018799,
"learning_rate": 1.3762144656351206e-05,
"loss": 0.02,
"num_input_tokens_seen": 804346624,
"step": 785500
},
{
"epoch": 7.252193650178537,
"grad_norm": 0.6138213872909546,
"learning_rate": 1.373907788265471e-05,
"loss": 0.0175,
"num_input_tokens_seen": 804858624,
"step": 786000
},
{
"epoch": 7.256807004917836,
"grad_norm": 0.5684888362884521,
"learning_rate": 1.3716011108958212e-05,
"loss": 0.0166,
"num_input_tokens_seen": 805370624,
"step": 786500
},
{
"epoch": 7.261420359657135,
"grad_norm": 0.7051540613174438,
"learning_rate": 1.3692944335261718e-05,
"loss": 0.0161,
"num_input_tokens_seen": 805882624,
"step": 787000
},
{
"epoch": 7.266033714396435,
"grad_norm": 0.7892741560935974,
"learning_rate": 1.366987756156522e-05,
"loss": 0.0193,
"num_input_tokens_seen": 806394624,
"step": 787500
},
{
"epoch": 7.270647069135734,
"grad_norm": 1.084768533706665,
"learning_rate": 1.3646810787868721e-05,
"loss": 0.0178,
"num_input_tokens_seen": 806906624,
"step": 788000
},
{
"epoch": 7.275260423875033,
"grad_norm": 1.111611008644104,
"learning_rate": 1.3623744014172227e-05,
"loss": 0.0181,
"num_input_tokens_seen": 807418624,
"step": 788500
},
{
"epoch": 7.279873778614332,
"grad_norm": 1.2572911977767944,
"learning_rate": 1.3600677240475729e-05,
"loss": 0.0216,
"num_input_tokens_seen": 807930624,
"step": 789000
},
{
"epoch": 7.2844871333536325,
"grad_norm": 1.4147090911865234,
"learning_rate": 1.3577610466779233e-05,
"loss": 0.0174,
"num_input_tokens_seen": 808442624,
"step": 789500
},
{
"epoch": 7.289100488092932,
"grad_norm": 1.129238247871399,
"learning_rate": 1.3554543693082735e-05,
"loss": 0.0174,
"num_input_tokens_seen": 808954624,
"step": 790000
},
{
"epoch": 7.293713842832231,
"grad_norm": 0.7517364621162415,
"learning_rate": 1.3531476919386241e-05,
"loss": 0.0198,
"num_input_tokens_seen": 809466624,
"step": 790500
},
{
"epoch": 7.29832719757153,
"grad_norm": 2.005709171295166,
"learning_rate": 1.3508410145689743e-05,
"loss": 0.019,
"num_input_tokens_seen": 809978624,
"step": 791000
},
{
"epoch": 7.3029405523108295,
"grad_norm": 0.5718657374382019,
"learning_rate": 1.3485343371993247e-05,
"loss": 0.0188,
"num_input_tokens_seen": 810490624,
"step": 791500
},
{
"epoch": 7.307553907050129,
"grad_norm": 2.84344744682312,
"learning_rate": 1.346227659829675e-05,
"loss": 0.018,
"num_input_tokens_seen": 811002624,
"step": 792000
},
{
"epoch": 7.312167261789428,
"grad_norm": 1.8831250667572021,
"learning_rate": 1.3439209824600255e-05,
"loss": 0.0198,
"num_input_tokens_seen": 811514624,
"step": 792500
},
{
"epoch": 7.316780616528727,
"grad_norm": 0.42998257279396057,
"learning_rate": 1.3416143050903757e-05,
"loss": 0.0182,
"num_input_tokens_seen": 812026624,
"step": 793000
},
{
"epoch": 7.3213939712680265,
"grad_norm": 0.4875911474227905,
"learning_rate": 1.3393076277207261e-05,
"loss": 0.0202,
"num_input_tokens_seen": 812538624,
"step": 793500
},
{
"epoch": 7.326007326007326,
"grad_norm": 0.6313169002532959,
"learning_rate": 1.3370009503510764e-05,
"loss": 0.0185,
"num_input_tokens_seen": 813050624,
"step": 794000
},
{
"epoch": 7.330620680746625,
"grad_norm": 0.5315720438957214,
"learning_rate": 1.3346942729814266e-05,
"loss": 0.0175,
"num_input_tokens_seen": 813562624,
"step": 794500
},
{
"epoch": 7.335234035485925,
"grad_norm": 0.636077344417572,
"learning_rate": 1.332387595611777e-05,
"loss": 0.0187,
"num_input_tokens_seen": 814074624,
"step": 795000
},
{
"epoch": 7.339847390225224,
"grad_norm": 1.2620755434036255,
"learning_rate": 1.3300809182421272e-05,
"loss": 0.0185,
"num_input_tokens_seen": 814586624,
"step": 795500
},
{
"epoch": 7.344460744964524,
"grad_norm": 0.40610164403915405,
"learning_rate": 1.3277742408724778e-05,
"loss": 0.0179,
"num_input_tokens_seen": 815098624,
"step": 796000
},
{
"epoch": 7.349074099703823,
"grad_norm": 0.5910019278526306,
"learning_rate": 1.325467563502828e-05,
"loss": 0.0191,
"num_input_tokens_seen": 815610624,
"step": 796500
},
{
"epoch": 7.353687454443122,
"grad_norm": 0.9699934720993042,
"learning_rate": 1.3231608861331784e-05,
"loss": 0.0173,
"num_input_tokens_seen": 816122624,
"step": 797000
},
{
"epoch": 7.358300809182421,
"grad_norm": 0.5334429740905762,
"learning_rate": 1.3208542087635286e-05,
"loss": 0.0185,
"num_input_tokens_seen": 816634624,
"step": 797500
},
{
"epoch": 7.362914163921721,
"grad_norm": 0.47226250171661377,
"learning_rate": 1.3185475313938792e-05,
"loss": 0.0181,
"num_input_tokens_seen": 817146624,
"step": 798000
},
{
"epoch": 7.36752751866102,
"grad_norm": 3.1056435108184814,
"learning_rate": 1.3162408540242294e-05,
"loss": 0.0189,
"num_input_tokens_seen": 817658624,
"step": 798500
},
{
"epoch": 7.372140873400319,
"grad_norm": 0.8559852838516235,
"learning_rate": 1.3139341766545798e-05,
"loss": 0.0186,
"num_input_tokens_seen": 818170624,
"step": 799000
},
{
"epoch": 7.376754228139618,
"grad_norm": 0.5092094540596008,
"learning_rate": 1.31162749928493e-05,
"loss": 0.0182,
"num_input_tokens_seen": 818682624,
"step": 799500
},
{
"epoch": 7.381367582878918,
"grad_norm": 0.7403343915939331,
"learning_rate": 1.3093208219152806e-05,
"loss": 0.0187,
"num_input_tokens_seen": 819194624,
"step": 800000
},
{
"epoch": 7.385980937618217,
"grad_norm": 1.0396490097045898,
"learning_rate": 1.3070141445456308e-05,
"loss": 0.0176,
"num_input_tokens_seen": 819706624,
"step": 800500
},
{
"epoch": 7.390594292357516,
"grad_norm": 1.229277491569519,
"learning_rate": 1.3047074671759809e-05,
"loss": 0.0166,
"num_input_tokens_seen": 820218624,
"step": 801000
},
{
"epoch": 7.395207647096816,
"grad_norm": 1.870112419128418,
"learning_rate": 1.3024007898063315e-05,
"loss": 0.0184,
"num_input_tokens_seen": 820730624,
"step": 801500
},
{
"epoch": 7.3998210018361155,
"grad_norm": 2.495352029800415,
"learning_rate": 1.3000941124366817e-05,
"loss": 0.0176,
"num_input_tokens_seen": 821242624,
"step": 802000
},
{
"epoch": 7.404434356575415,
"grad_norm": 1.2543821334838867,
"learning_rate": 1.2977874350670321e-05,
"loss": 0.0187,
"num_input_tokens_seen": 821754624,
"step": 802500
},
{
"epoch": 7.409047711314714,
"grad_norm": 0.9267345666885376,
"learning_rate": 1.2954807576973823e-05,
"loss": 0.0167,
"num_input_tokens_seen": 822266624,
"step": 803000
},
{
"epoch": 7.413661066054013,
"grad_norm": 0.7813261151313782,
"learning_rate": 1.2931740803277329e-05,
"loss": 0.0178,
"num_input_tokens_seen": 822778624,
"step": 803500
},
{
"epoch": 7.4182744207933125,
"grad_norm": 2.1433377265930176,
"learning_rate": 1.2908674029580831e-05,
"loss": 0.0185,
"num_input_tokens_seen": 823290624,
"step": 804000
},
{
"epoch": 7.422887775532612,
"grad_norm": 0.4169975519180298,
"learning_rate": 1.2885607255884335e-05,
"loss": 0.0174,
"num_input_tokens_seen": 823802624,
"step": 804500
},
{
"epoch": 7.427501130271911,
"grad_norm": 0.7654904723167419,
"learning_rate": 1.2862540482187837e-05,
"loss": 0.0189,
"num_input_tokens_seen": 824314624,
"step": 805000
},
{
"epoch": 7.43211448501121,
"grad_norm": 0.7712762355804443,
"learning_rate": 1.2839473708491343e-05,
"loss": 0.0169,
"num_input_tokens_seen": 824826624,
"step": 805500
},
{
"epoch": 7.436727839750509,
"grad_norm": 1.179842233657837,
"learning_rate": 1.2816406934794845e-05,
"loss": 0.0169,
"num_input_tokens_seen": 825338624,
"step": 806000
},
{
"epoch": 7.441341194489809,
"grad_norm": 1.1706069707870483,
"learning_rate": 1.279334016109835e-05,
"loss": 0.0191,
"num_input_tokens_seen": 825850624,
"step": 806500
},
{
"epoch": 7.445954549229109,
"grad_norm": 1.7458144426345825,
"learning_rate": 1.2770273387401852e-05,
"loss": 0.0183,
"num_input_tokens_seen": 826362624,
"step": 807000
},
{
"epoch": 7.450567903968408,
"grad_norm": 0.8518096804618835,
"learning_rate": 1.2747206613705354e-05,
"loss": 0.0181,
"num_input_tokens_seen": 826874624,
"step": 807500
},
{
"epoch": 7.455181258707707,
"grad_norm": 0.6776919960975647,
"learning_rate": 1.2724139840008858e-05,
"loss": 0.0167,
"num_input_tokens_seen": 827386624,
"step": 808000
},
{
"epoch": 7.459794613447007,
"grad_norm": 1.8147574663162231,
"learning_rate": 1.270107306631236e-05,
"loss": 0.0172,
"num_input_tokens_seen": 827898624,
"step": 808500
},
{
"epoch": 7.464407968186306,
"grad_norm": 0.730553150177002,
"learning_rate": 1.2678006292615866e-05,
"loss": 0.0163,
"num_input_tokens_seen": 828410624,
"step": 809000
},
{
"epoch": 7.469021322925605,
"grad_norm": 0.5966499447822571,
"learning_rate": 1.2654939518919368e-05,
"loss": 0.0185,
"num_input_tokens_seen": 828922624,
"step": 809500
},
{
"epoch": 7.473634677664904,
"grad_norm": 0.5111476182937622,
"learning_rate": 1.2631872745222872e-05,
"loss": 0.0181,
"num_input_tokens_seen": 829434624,
"step": 810000
},
{
"epoch": 7.4782480324042035,
"grad_norm": 1.1634365320205688,
"learning_rate": 1.2608805971526374e-05,
"loss": 0.0162,
"num_input_tokens_seen": 829946624,
"step": 810500
},
{
"epoch": 7.482861387143503,
"grad_norm": 1.030910611152649,
"learning_rate": 1.258573919782988e-05,
"loss": 0.0184,
"num_input_tokens_seen": 830458624,
"step": 811000
},
{
"epoch": 7.487474741882802,
"grad_norm": 1.035938024520874,
"learning_rate": 1.2562672424133382e-05,
"loss": 0.0177,
"num_input_tokens_seen": 830970624,
"step": 811500
},
{
"epoch": 7.492088096622101,
"grad_norm": 1.1685384511947632,
"learning_rate": 1.2539605650436886e-05,
"loss": 0.0173,
"num_input_tokens_seen": 831482624,
"step": 812000
},
{
"epoch": 7.496701451361401,
"grad_norm": 0.8186880946159363,
"learning_rate": 1.2516538876740388e-05,
"loss": 0.0175,
"num_input_tokens_seen": 831994624,
"step": 812500
},
{
"epoch": 7.501314806100701,
"grad_norm": 1.2309128046035767,
"learning_rate": 1.2493472103043892e-05,
"loss": 0.0187,
"num_input_tokens_seen": 832506624,
"step": 813000
},
{
"epoch": 7.50592816084,
"grad_norm": 0.9243940114974976,
"learning_rate": 1.2470405329347395e-05,
"loss": 0.0208,
"num_input_tokens_seen": 833018624,
"step": 813500
},
{
"epoch": 7.510541515579299,
"grad_norm": 1.5183156728744507,
"learning_rate": 1.2447338555650899e-05,
"loss": 0.0188,
"num_input_tokens_seen": 833530624,
"step": 814000
},
{
"epoch": 7.515154870318598,
"grad_norm": 0.7042239904403687,
"learning_rate": 1.2424271781954403e-05,
"loss": 0.0175,
"num_input_tokens_seen": 834042624,
"step": 814500
},
{
"epoch": 7.519768225057898,
"grad_norm": 0.7798308730125427,
"learning_rate": 1.2401205008257907e-05,
"loss": 0.0203,
"num_input_tokens_seen": 834554624,
"step": 815000
},
{
"epoch": 7.524381579797197,
"grad_norm": 0.6466756463050842,
"learning_rate": 1.2378138234561409e-05,
"loss": 0.019,
"num_input_tokens_seen": 835066624,
"step": 815500
},
{
"epoch": 7.528994934536496,
"grad_norm": 1.0861841440200806,
"learning_rate": 1.2355071460864913e-05,
"loss": 0.0177,
"num_input_tokens_seen": 835578624,
"step": 816000
},
{
"epoch": 7.533608289275795,
"grad_norm": 2.7624402046203613,
"learning_rate": 1.2332004687168417e-05,
"loss": 0.0175,
"num_input_tokens_seen": 836090624,
"step": 816500
},
{
"epoch": 7.538221644015095,
"grad_norm": 1.2840367555618286,
"learning_rate": 1.2308937913471919e-05,
"loss": 0.018,
"num_input_tokens_seen": 836602624,
"step": 817000
},
{
"epoch": 7.542834998754394,
"grad_norm": 0.6789388656616211,
"learning_rate": 1.2285871139775421e-05,
"loss": 0.0183,
"num_input_tokens_seen": 837114624,
"step": 817500
},
{
"epoch": 7.547448353493694,
"grad_norm": 0.5279095768928528,
"learning_rate": 1.2262804366078925e-05,
"loss": 0.0184,
"num_input_tokens_seen": 837626624,
"step": 818000
},
{
"epoch": 7.552061708232992,
"grad_norm": 0.5110554099082947,
"learning_rate": 1.223973759238243e-05,
"loss": 0.0178,
"num_input_tokens_seen": 838138624,
"step": 818500
},
{
"epoch": 7.5566750629722925,
"grad_norm": 1.535260796546936,
"learning_rate": 1.2216670818685932e-05,
"loss": 0.0189,
"num_input_tokens_seen": 838650624,
"step": 819000
},
{
"epoch": 7.561288417711592,
"grad_norm": 3.005444049835205,
"learning_rate": 1.2193604044989436e-05,
"loss": 0.0194,
"num_input_tokens_seen": 839162624,
"step": 819500
},
{
"epoch": 7.565901772450891,
"grad_norm": 0.3890930712223053,
"learning_rate": 1.217053727129294e-05,
"loss": 0.0182,
"num_input_tokens_seen": 839674624,
"step": 820000
},
{
"epoch": 7.57051512719019,
"grad_norm": 3.0413002967834473,
"learning_rate": 1.2147470497596444e-05,
"loss": 0.0179,
"num_input_tokens_seen": 840186624,
"step": 820500
},
{
"epoch": 7.5751284819294895,
"grad_norm": 0.33747154474258423,
"learning_rate": 1.2124403723899946e-05,
"loss": 0.0182,
"num_input_tokens_seen": 840698624,
"step": 821000
},
{
"epoch": 7.579741836668789,
"grad_norm": 0.7888673543930054,
"learning_rate": 1.210133695020345e-05,
"loss": 0.0158,
"num_input_tokens_seen": 841210624,
"step": 821500
},
{
"epoch": 7.584355191408088,
"grad_norm": 0.5673322081565857,
"learning_rate": 1.2078270176506954e-05,
"loss": 0.0175,
"num_input_tokens_seen": 841722624,
"step": 822000
},
{
"epoch": 7.588968546147387,
"grad_norm": 7.8960700035095215,
"learning_rate": 1.2055203402810456e-05,
"loss": 0.0171,
"num_input_tokens_seen": 842234624,
"step": 822500
},
{
"epoch": 7.5935819008866865,
"grad_norm": 0.6810684204101562,
"learning_rate": 1.203213662911396e-05,
"loss": 0.0169,
"num_input_tokens_seen": 842746624,
"step": 823000
},
{
"epoch": 7.598195255625986,
"grad_norm": 0.88917076587677,
"learning_rate": 1.2009069855417462e-05,
"loss": 0.0176,
"num_input_tokens_seen": 843258624,
"step": 823500
},
{
"epoch": 7.602808610365285,
"grad_norm": 0.7236852049827576,
"learning_rate": 1.1986003081720966e-05,
"loss": 0.0178,
"num_input_tokens_seen": 843770624,
"step": 824000
},
{
"epoch": 7.607421965104585,
"grad_norm": 2.4100208282470703,
"learning_rate": 1.196293630802447e-05,
"loss": 0.0183,
"num_input_tokens_seen": 844282624,
"step": 824500
},
{
"epoch": 7.612035319843884,
"grad_norm": 0.9818079471588135,
"learning_rate": 1.1939869534327972e-05,
"loss": 0.0184,
"num_input_tokens_seen": 844794624,
"step": 825000
},
{
"epoch": 7.616648674583184,
"grad_norm": 5.109523773193359,
"learning_rate": 1.1916802760631476e-05,
"loss": 0.0182,
"num_input_tokens_seen": 845306624,
"step": 825500
},
{
"epoch": 7.621262029322483,
"grad_norm": 1.1535288095474243,
"learning_rate": 1.189373598693498e-05,
"loss": 0.018,
"num_input_tokens_seen": 845818624,
"step": 826000
},
{
"epoch": 7.625875384061782,
"grad_norm": 1.0759390592575073,
"learning_rate": 1.1870669213238483e-05,
"loss": 0.018,
"num_input_tokens_seen": 846330624,
"step": 826500
},
{
"epoch": 7.630488738801081,
"grad_norm": 0.9492645263671875,
"learning_rate": 1.1847602439541987e-05,
"loss": 0.0162,
"num_input_tokens_seen": 846842624,
"step": 827000
},
{
"epoch": 7.635102093540381,
"grad_norm": 0.5077918767929077,
"learning_rate": 1.182453566584549e-05,
"loss": 0.018,
"num_input_tokens_seen": 847354624,
"step": 827500
},
{
"epoch": 7.63971544827968,
"grad_norm": 0.5069125890731812,
"learning_rate": 1.1801468892148995e-05,
"loss": 0.0181,
"num_input_tokens_seen": 847866624,
"step": 828000
},
{
"epoch": 7.644328803018979,
"grad_norm": 0.35941779613494873,
"learning_rate": 1.1778402118452497e-05,
"loss": 0.0187,
"num_input_tokens_seen": 848378624,
"step": 828500
},
{
"epoch": 7.648942157758278,
"grad_norm": 0.7320166230201721,
"learning_rate": 1.1755335344756001e-05,
"loss": 0.0166,
"num_input_tokens_seen": 848890624,
"step": 829000
},
{
"epoch": 7.653555512497578,
"grad_norm": 0.4909152686595917,
"learning_rate": 1.1732268571059505e-05,
"loss": 0.0174,
"num_input_tokens_seen": 849402624,
"step": 829500
},
{
"epoch": 7.658168867236878,
"grad_norm": 0.5299736857414246,
"learning_rate": 1.1709201797363007e-05,
"loss": 0.017,
"num_input_tokens_seen": 849914624,
"step": 830000
},
{
"epoch": 7.662782221976177,
"grad_norm": 1.6265432834625244,
"learning_rate": 1.168613502366651e-05,
"loss": 0.0192,
"num_input_tokens_seen": 850426624,
"step": 830500
},
{
"epoch": 7.667395576715476,
"grad_norm": 1.0842050313949585,
"learning_rate": 1.1663068249970013e-05,
"loss": 0.0174,
"num_input_tokens_seen": 850938624,
"step": 831000
},
{
"epoch": 7.6720089314547755,
"grad_norm": 0.46629172563552856,
"learning_rate": 1.1640001476273517e-05,
"loss": 0.019,
"num_input_tokens_seen": 851450624,
"step": 831500
},
{
"epoch": 7.676622286194075,
"grad_norm": 0.786178469657898,
"learning_rate": 1.161693470257702e-05,
"loss": 0.0174,
"num_input_tokens_seen": 851962624,
"step": 832000
},
{
"epoch": 7.681235640933374,
"grad_norm": 0.9928342700004578,
"learning_rate": 1.1593867928880524e-05,
"loss": 0.0187,
"num_input_tokens_seen": 852474624,
"step": 832500
},
{
"epoch": 7.685848995672673,
"grad_norm": 0.19910675287246704,
"learning_rate": 1.1570801155184028e-05,
"loss": 0.0165,
"num_input_tokens_seen": 852986624,
"step": 833000
},
{
"epoch": 7.6904623504119725,
"grad_norm": 0.44422009587287903,
"learning_rate": 1.1547734381487532e-05,
"loss": 0.019,
"num_input_tokens_seen": 853498624,
"step": 833500
},
{
"epoch": 7.695075705151272,
"grad_norm": 1.4326293468475342,
"learning_rate": 1.1524667607791034e-05,
"loss": 0.0189,
"num_input_tokens_seen": 854010624,
"step": 834000
},
{
"epoch": 7.699689059890571,
"grad_norm": 2.208235263824463,
"learning_rate": 1.1501600834094538e-05,
"loss": 0.0181,
"num_input_tokens_seen": 854522624,
"step": 834500
},
{
"epoch": 7.70430241462987,
"grad_norm": 1.5056183338165283,
"learning_rate": 1.1478534060398042e-05,
"loss": 0.0164,
"num_input_tokens_seen": 855034624,
"step": 835000
},
{
"epoch": 7.70891576936917,
"grad_norm": 0.991448700428009,
"learning_rate": 1.1455467286701544e-05,
"loss": 0.0188,
"num_input_tokens_seen": 855546624,
"step": 835500
},
{
"epoch": 7.71352912410847,
"grad_norm": 0.48746320605278015,
"learning_rate": 1.1432400513005048e-05,
"loss": 0.0187,
"num_input_tokens_seen": 856058624,
"step": 836000
},
{
"epoch": 7.718142478847769,
"grad_norm": 0.7954283356666565,
"learning_rate": 1.140933373930855e-05,
"loss": 0.0186,
"num_input_tokens_seen": 856570624,
"step": 836500
},
{
"epoch": 7.722755833587068,
"grad_norm": 0.3314274251461029,
"learning_rate": 1.1386266965612054e-05,
"loss": 0.0183,
"num_input_tokens_seen": 857082624,
"step": 837000
},
{
"epoch": 7.727369188326367,
"grad_norm": 0.40846577286720276,
"learning_rate": 1.1363200191915556e-05,
"loss": 0.0188,
"num_input_tokens_seen": 857594624,
"step": 837500
},
{
"epoch": 7.731982543065667,
"grad_norm": 0.5026475787162781,
"learning_rate": 1.134013341821906e-05,
"loss": 0.0174,
"num_input_tokens_seen": 858106624,
"step": 838000
},
{
"epoch": 7.736595897804966,
"grad_norm": 0.7746123671531677,
"learning_rate": 1.1317066644522564e-05,
"loss": 0.0195,
"num_input_tokens_seen": 858618624,
"step": 838500
},
{
"epoch": 7.741209252544265,
"grad_norm": 0.835455060005188,
"learning_rate": 1.1293999870826068e-05,
"loss": 0.0183,
"num_input_tokens_seen": 859130624,
"step": 839000
},
{
"epoch": 7.745822607283564,
"grad_norm": 1.107001781463623,
"learning_rate": 1.127093309712957e-05,
"loss": 0.0201,
"num_input_tokens_seen": 859642624,
"step": 839500
},
{
"epoch": 7.750435962022864,
"grad_norm": 0.31434282660484314,
"learning_rate": 1.1247866323433075e-05,
"loss": 0.0195,
"num_input_tokens_seen": 860154624,
"step": 840000
},
{
"epoch": 7.755049316762163,
"grad_norm": 0.7980784773826599,
"learning_rate": 1.1224799549736579e-05,
"loss": 0.0173,
"num_input_tokens_seen": 860666624,
"step": 840500
},
{
"epoch": 7.759662671501462,
"grad_norm": 0.6341221332550049,
"learning_rate": 1.1201732776040081e-05,
"loss": 0.0156,
"num_input_tokens_seen": 861178624,
"step": 841000
},
{
"epoch": 7.764276026240761,
"grad_norm": 1.298004388809204,
"learning_rate": 1.1178666002343585e-05,
"loss": 0.0182,
"num_input_tokens_seen": 861690624,
"step": 841500
},
{
"epoch": 7.768889380980061,
"grad_norm": 0.6212522983551025,
"learning_rate": 1.1155599228647089e-05,
"loss": 0.0183,
"num_input_tokens_seen": 862202624,
"step": 842000
},
{
"epoch": 7.773502735719361,
"grad_norm": 1.0448174476623535,
"learning_rate": 1.1132532454950593e-05,
"loss": 0.0167,
"num_input_tokens_seen": 862714624,
"step": 842500
},
{
"epoch": 7.77811609045866,
"grad_norm": 0.4349260628223419,
"learning_rate": 1.1109465681254095e-05,
"loss": 0.0182,
"num_input_tokens_seen": 863226624,
"step": 843000
},
{
"epoch": 7.782729445197959,
"grad_norm": 0.5279752016067505,
"learning_rate": 1.1086398907557597e-05,
"loss": 0.0191,
"num_input_tokens_seen": 863738624,
"step": 843500
},
{
"epoch": 7.787342799937258,
"grad_norm": 2.5519967079162598,
"learning_rate": 1.1063332133861101e-05,
"loss": 0.0188,
"num_input_tokens_seen": 864250624,
"step": 844000
},
{
"epoch": 7.791956154676558,
"grad_norm": 1.002515435218811,
"learning_rate": 1.1040265360164605e-05,
"loss": 0.0181,
"num_input_tokens_seen": 864762624,
"step": 844500
},
{
"epoch": 7.796569509415857,
"grad_norm": 1.0723029375076294,
"learning_rate": 1.1017198586468108e-05,
"loss": 0.0172,
"num_input_tokens_seen": 865274624,
"step": 845000
},
{
"epoch": 7.801182864155156,
"grad_norm": 0.492806613445282,
"learning_rate": 1.0994131812771612e-05,
"loss": 0.0184,
"num_input_tokens_seen": 865786624,
"step": 845500
},
{
"epoch": 7.805796218894455,
"grad_norm": 2.1584246158599854,
"learning_rate": 1.0971065039075116e-05,
"loss": 0.0195,
"num_input_tokens_seen": 866298624,
"step": 846000
},
{
"epoch": 7.810409573633755,
"grad_norm": 0.9871762990951538,
"learning_rate": 1.094799826537862e-05,
"loss": 0.0172,
"num_input_tokens_seen": 866810624,
"step": 846500
},
{
"epoch": 7.815022928373054,
"grad_norm": 1.234832525253296,
"learning_rate": 1.0924931491682122e-05,
"loss": 0.0178,
"num_input_tokens_seen": 867322624,
"step": 847000
},
{
"epoch": 7.819636283112354,
"grad_norm": 0.8536167144775391,
"learning_rate": 1.0901864717985626e-05,
"loss": 0.0189,
"num_input_tokens_seen": 867834624,
"step": 847500
},
{
"epoch": 7.824249637851653,
"grad_norm": 0.5045762658119202,
"learning_rate": 1.087879794428913e-05,
"loss": 0.0165,
"num_input_tokens_seen": 868346624,
"step": 848000
},
{
"epoch": 7.8288629925909525,
"grad_norm": 0.539504885673523,
"learning_rate": 1.0855731170592632e-05,
"loss": 0.0189,
"num_input_tokens_seen": 868858624,
"step": 848500
},
{
"epoch": 7.833476347330252,
"grad_norm": 0.6124027967453003,
"learning_rate": 1.0832664396896136e-05,
"loss": 0.0181,
"num_input_tokens_seen": 869370624,
"step": 849000
},
{
"epoch": 7.838089702069551,
"grad_norm": 0.5063890814781189,
"learning_rate": 1.0809597623199638e-05,
"loss": 0.0166,
"num_input_tokens_seen": 869882624,
"step": 849500
},
{
"epoch": 7.84270305680885,
"grad_norm": 0.4935370087623596,
"learning_rate": 1.0786530849503142e-05,
"loss": 0.0182,
"num_input_tokens_seen": 870394624,
"step": 850000
},
{
"epoch": 7.8473164115481495,
"grad_norm": 1.3337877988815308,
"learning_rate": 1.0763464075806644e-05,
"loss": 0.0167,
"num_input_tokens_seen": 870906624,
"step": 850500
},
{
"epoch": 7.851929766287449,
"grad_norm": 0.5984758734703064,
"learning_rate": 1.0740397302110148e-05,
"loss": 0.0181,
"num_input_tokens_seen": 871418624,
"step": 851000
},
{
"epoch": 7.856543121026748,
"grad_norm": 0.6499104499816895,
"learning_rate": 1.0717330528413652e-05,
"loss": 0.0176,
"num_input_tokens_seen": 871930624,
"step": 851500
},
{
"epoch": 7.861156475766047,
"grad_norm": 0.5723326206207275,
"learning_rate": 1.0694263754717156e-05,
"loss": 0.0173,
"num_input_tokens_seen": 872442624,
"step": 852000
},
{
"epoch": 7.8657698305053465,
"grad_norm": 0.6458103060722351,
"learning_rate": 1.0671196981020659e-05,
"loss": 0.0185,
"num_input_tokens_seen": 872954624,
"step": 852500
},
{
"epoch": 7.870383185244647,
"grad_norm": 0.6607184410095215,
"learning_rate": 1.0648130207324163e-05,
"loss": 0.0174,
"num_input_tokens_seen": 873466624,
"step": 853000
},
{
"epoch": 7.874996539983946,
"grad_norm": 0.7945510745048523,
"learning_rate": 1.0625063433627667e-05,
"loss": 0.0178,
"num_input_tokens_seen": 873978624,
"step": 853500
},
{
"epoch": 7.879609894723245,
"grad_norm": 0.9480940103530884,
"learning_rate": 1.0601996659931169e-05,
"loss": 0.019,
"num_input_tokens_seen": 874490624,
"step": 854000
},
{
"epoch": 7.884223249462544,
"grad_norm": 0.5195125937461853,
"learning_rate": 1.0578929886234673e-05,
"loss": 0.017,
"num_input_tokens_seen": 875002624,
"step": 854500
},
{
"epoch": 7.888836604201844,
"grad_norm": 0.3116241693496704,
"learning_rate": 1.0555863112538177e-05,
"loss": 0.0189,
"num_input_tokens_seen": 875514624,
"step": 855000
},
{
"epoch": 7.893449958941143,
"grad_norm": 0.8278101086616516,
"learning_rate": 1.053279633884168e-05,
"loss": 0.0177,
"num_input_tokens_seen": 876026624,
"step": 855500
},
{
"epoch": 7.898063313680442,
"grad_norm": 0.6848555207252502,
"learning_rate": 1.0509729565145181e-05,
"loss": 0.0186,
"num_input_tokens_seen": 876538624,
"step": 856000
},
{
"epoch": 7.902676668419741,
"grad_norm": 0.9749637842178345,
"learning_rate": 1.0486662791448685e-05,
"loss": 0.0214,
"num_input_tokens_seen": 877050624,
"step": 856500
},
{
"epoch": 7.907290023159041,
"grad_norm": 2.486924648284912,
"learning_rate": 1.046359601775219e-05,
"loss": 0.0194,
"num_input_tokens_seen": 877562624,
"step": 857000
},
{
"epoch": 7.91190337789834,
"grad_norm": 0.8250918388366699,
"learning_rate": 1.0440529244055693e-05,
"loss": 0.0187,
"num_input_tokens_seen": 878074624,
"step": 857500
},
{
"epoch": 7.916516732637639,
"grad_norm": 1.9874022006988525,
"learning_rate": 1.0417462470359196e-05,
"loss": 0.0181,
"num_input_tokens_seen": 878586624,
"step": 858000
},
{
"epoch": 7.921130087376939,
"grad_norm": 1.451173186302185,
"learning_rate": 1.03943956966627e-05,
"loss": 0.0191,
"num_input_tokens_seen": 879098624,
"step": 858500
},
{
"epoch": 7.925743442116238,
"grad_norm": 3.8313064575195312,
"learning_rate": 1.0371328922966204e-05,
"loss": 0.0188,
"num_input_tokens_seen": 879610624,
"step": 859000
},
{
"epoch": 7.930356796855538,
"grad_norm": 0.9106965661048889,
"learning_rate": 1.0348262149269706e-05,
"loss": 0.0164,
"num_input_tokens_seen": 880122624,
"step": 859500
},
{
"epoch": 7.934970151594837,
"grad_norm": 0.9856759905815125,
"learning_rate": 1.032519537557321e-05,
"loss": 0.0189,
"num_input_tokens_seen": 880634624,
"step": 860000
},
{
"epoch": 7.939583506334136,
"grad_norm": 1.1179744005203247,
"learning_rate": 1.0302128601876714e-05,
"loss": 0.0186,
"num_input_tokens_seen": 881146624,
"step": 860500
},
{
"epoch": 7.9441968610734355,
"grad_norm": 0.8333301544189453,
"learning_rate": 1.0279061828180218e-05,
"loss": 0.0182,
"num_input_tokens_seen": 881658624,
"step": 861000
},
{
"epoch": 7.948810215812735,
"grad_norm": 0.4756206274032593,
"learning_rate": 1.025599505448372e-05,
"loss": 0.0187,
"num_input_tokens_seen": 882170624,
"step": 861500
},
{
"epoch": 7.953423570552034,
"grad_norm": 1.3627671003341675,
"learning_rate": 1.0232928280787224e-05,
"loss": 0.018,
"num_input_tokens_seen": 882682624,
"step": 862000
},
{
"epoch": 7.958036925291333,
"grad_norm": 1.3066837787628174,
"learning_rate": 1.0209861507090726e-05,
"loss": 0.0193,
"num_input_tokens_seen": 883194624,
"step": 862500
},
{
"epoch": 7.9626502800306325,
"grad_norm": 0.46038496494293213,
"learning_rate": 1.018679473339423e-05,
"loss": 0.018,
"num_input_tokens_seen": 883706624,
"step": 863000
},
{
"epoch": 7.967263634769932,
"grad_norm": 0.67403644323349,
"learning_rate": 1.0163727959697732e-05,
"loss": 0.0188,
"num_input_tokens_seen": 884218624,
"step": 863500
},
{
"epoch": 7.971876989509231,
"grad_norm": 0.7785734534263611,
"learning_rate": 1.0140661186001236e-05,
"loss": 0.0169,
"num_input_tokens_seen": 884730624,
"step": 864000
},
{
"epoch": 7.97649034424853,
"grad_norm": 0.8497280478477478,
"learning_rate": 1.011759441230474e-05,
"loss": 0.0189,
"num_input_tokens_seen": 885242624,
"step": 864500
},
{
"epoch": 7.98110369898783,
"grad_norm": 4.073908805847168,
"learning_rate": 1.0094527638608243e-05,
"loss": 0.0183,
"num_input_tokens_seen": 885754624,
"step": 865000
},
{
"epoch": 7.98571705372713,
"grad_norm": 0.7901633977890015,
"learning_rate": 1.0071460864911747e-05,
"loss": 0.0177,
"num_input_tokens_seen": 886266624,
"step": 865500
},
{
"epoch": 7.990330408466429,
"grad_norm": 2.1585545539855957,
"learning_rate": 1.004839409121525e-05,
"loss": 0.0172,
"num_input_tokens_seen": 886778624,
"step": 866000
},
{
"epoch": 7.994943763205728,
"grad_norm": 0.6002645492553711,
"learning_rate": 1.0025327317518755e-05,
"loss": 0.0193,
"num_input_tokens_seen": 887290624,
"step": 866500
},
{
"epoch": 7.999557117945027,
"grad_norm": 0.602433443069458,
"learning_rate": 1.0002260543822257e-05,
"loss": 0.0182,
"num_input_tokens_seen": 887802624,
"step": 867000
},
{
"epoch": 8.0,
"eval_combined_score": 0.0675718570300666,
"eval_loss": 0.06757185608148575,
"eval_mse": 0.06757185797864745,
"eval_runtime": 46.9325,
"eval_samples_per_second": 2052.691,
"eval_steps_per_second": 256.602,
"num_input_tokens_seen": 887851008,
"step": 867048
},
{
"epoch": 8.004170472684327,
"grad_norm": 1.5062319040298462,
"learning_rate": 9.97919377012576e-06,
"loss": 0.0154,
"num_input_tokens_seen": 888313856,
"step": 867500
},
{
"epoch": 8.008783827423626,
"grad_norm": 0.3685579001903534,
"learning_rate": 9.956126996429265e-06,
"loss": 0.0156,
"num_input_tokens_seen": 888825856,
"step": 868000
},
{
"epoch": 8.013397182162926,
"grad_norm": 0.5031562447547913,
"learning_rate": 9.933060222732767e-06,
"loss": 0.0147,
"num_input_tokens_seen": 889337856,
"step": 868500
},
{
"epoch": 8.018010536902224,
"grad_norm": 1.041576623916626,
"learning_rate": 9.90999344903627e-06,
"loss": 0.0158,
"num_input_tokens_seen": 889849856,
"step": 869000
},
{
"epoch": 8.022623891641524,
"grad_norm": 0.6168863773345947,
"learning_rate": 9.886926675339773e-06,
"loss": 0.0149,
"num_input_tokens_seen": 890361856,
"step": 869500
},
{
"epoch": 8.027237246380823,
"grad_norm": 1.0457834005355835,
"learning_rate": 9.863859901643277e-06,
"loss": 0.0155,
"num_input_tokens_seen": 890873856,
"step": 870000
},
{
"epoch": 8.031850601120123,
"grad_norm": 0.7371172904968262,
"learning_rate": 9.840793127946781e-06,
"loss": 0.0156,
"num_input_tokens_seen": 891385856,
"step": 870500
},
{
"epoch": 8.036463955859421,
"grad_norm": 1.857638955116272,
"learning_rate": 9.817726354250284e-06,
"loss": 0.0136,
"num_input_tokens_seen": 891897856,
"step": 871000
},
{
"epoch": 8.041077310598721,
"grad_norm": 1.3631207942962646,
"learning_rate": 9.794659580553788e-06,
"loss": 0.0157,
"num_input_tokens_seen": 892409856,
"step": 871500
},
{
"epoch": 8.04569066533802,
"grad_norm": 1.4387595653533936,
"learning_rate": 9.771592806857291e-06,
"loss": 0.0155,
"num_input_tokens_seen": 892921856,
"step": 872000
},
{
"epoch": 8.05030402007732,
"grad_norm": 0.44265180826187134,
"learning_rate": 9.748526033160794e-06,
"loss": 0.0151,
"num_input_tokens_seen": 893433856,
"step": 872500
},
{
"epoch": 8.054917374816618,
"grad_norm": 0.7352337837219238,
"learning_rate": 9.725459259464298e-06,
"loss": 0.0156,
"num_input_tokens_seen": 893945856,
"step": 873000
},
{
"epoch": 8.059530729555918,
"grad_norm": 0.6806060075759888,
"learning_rate": 9.702392485767802e-06,
"loss": 0.0139,
"num_input_tokens_seen": 894457856,
"step": 873500
},
{
"epoch": 8.064144084295219,
"grad_norm": 0.7403847575187683,
"learning_rate": 9.679325712071306e-06,
"loss": 0.0164,
"num_input_tokens_seen": 894969856,
"step": 874000
},
{
"epoch": 8.068757439034517,
"grad_norm": 1.1141221523284912,
"learning_rate": 9.656258938374808e-06,
"loss": 0.0148,
"num_input_tokens_seen": 895481856,
"step": 874500
},
{
"epoch": 8.073370793773817,
"grad_norm": 0.983514130115509,
"learning_rate": 9.633192164678312e-06,
"loss": 0.0154,
"num_input_tokens_seen": 895993856,
"step": 875000
},
{
"epoch": 8.077984148513115,
"grad_norm": 0.4191863536834717,
"learning_rate": 9.610125390981814e-06,
"loss": 0.0171,
"num_input_tokens_seen": 896505856,
"step": 875500
},
{
"epoch": 8.082597503252416,
"grad_norm": 0.4481130540370941,
"learning_rate": 9.587058617285318e-06,
"loss": 0.0134,
"num_input_tokens_seen": 897017856,
"step": 876000
},
{
"epoch": 8.087210857991714,
"grad_norm": 0.7153156995773315,
"learning_rate": 9.56399184358882e-06,
"loss": 0.0152,
"num_input_tokens_seen": 897529856,
"step": 876500
},
{
"epoch": 8.091824212731014,
"grad_norm": 1.7068063020706177,
"learning_rate": 9.540925069892324e-06,
"loss": 0.0163,
"num_input_tokens_seen": 898041856,
"step": 877000
},
{
"epoch": 8.096437567470312,
"grad_norm": 0.5899567008018494,
"learning_rate": 9.517858296195828e-06,
"loss": 0.0158,
"num_input_tokens_seen": 898553856,
"step": 877500
},
{
"epoch": 8.101050922209613,
"grad_norm": 0.9179006218910217,
"learning_rate": 9.49479152249933e-06,
"loss": 0.0143,
"num_input_tokens_seen": 899065856,
"step": 878000
},
{
"epoch": 8.105664276948911,
"grad_norm": 0.7641995549201965,
"learning_rate": 9.471724748802835e-06,
"loss": 0.0158,
"num_input_tokens_seen": 899577856,
"step": 878500
},
{
"epoch": 8.110277631688211,
"grad_norm": 0.8679375648498535,
"learning_rate": 9.448657975106339e-06,
"loss": 0.0156,
"num_input_tokens_seen": 900089856,
"step": 879000
},
{
"epoch": 8.114890986427511,
"grad_norm": 0.981959342956543,
"learning_rate": 9.425591201409843e-06,
"loss": 0.0163,
"num_input_tokens_seen": 900601856,
"step": 879500
},
{
"epoch": 8.11950434116681,
"grad_norm": 0.5581063032150269,
"learning_rate": 9.402524427713345e-06,
"loss": 0.0153,
"num_input_tokens_seen": 901113856,
"step": 880000
},
{
"epoch": 8.12411769590611,
"grad_norm": 0.4459242522716522,
"learning_rate": 9.379457654016849e-06,
"loss": 0.0139,
"num_input_tokens_seen": 901625856,
"step": 880500
},
{
"epoch": 8.128731050645408,
"grad_norm": 0.5052184462547302,
"learning_rate": 9.356390880320353e-06,
"loss": 0.0137,
"num_input_tokens_seen": 902137856,
"step": 881000
},
{
"epoch": 8.133344405384708,
"grad_norm": 2.326282024383545,
"learning_rate": 9.333324106623855e-06,
"loss": 0.0145,
"num_input_tokens_seen": 902649856,
"step": 881500
},
{
"epoch": 8.137957760124007,
"grad_norm": 0.3621096909046173,
"learning_rate": 9.310257332927357e-06,
"loss": 0.015,
"num_input_tokens_seen": 903161856,
"step": 882000
},
{
"epoch": 8.142571114863307,
"grad_norm": 0.599589467048645,
"learning_rate": 9.287190559230861e-06,
"loss": 0.0133,
"num_input_tokens_seen": 903673856,
"step": 882500
},
{
"epoch": 8.147184469602605,
"grad_norm": 0.6334195733070374,
"learning_rate": 9.264123785534365e-06,
"loss": 0.014,
"num_input_tokens_seen": 904185856,
"step": 883000
},
{
"epoch": 8.151797824341905,
"grad_norm": 0.5166653990745544,
"learning_rate": 9.241057011837868e-06,
"loss": 0.0159,
"num_input_tokens_seen": 904697856,
"step": 883500
},
{
"epoch": 8.156411179081203,
"grad_norm": 0.9574226140975952,
"learning_rate": 9.217990238141372e-06,
"loss": 0.0147,
"num_input_tokens_seen": 905209856,
"step": 884000
},
{
"epoch": 8.161024533820504,
"grad_norm": 0.7625335454940796,
"learning_rate": 9.194923464444876e-06,
"loss": 0.0172,
"num_input_tokens_seen": 905721856,
"step": 884500
},
{
"epoch": 8.165637888559804,
"grad_norm": 0.5956442356109619,
"learning_rate": 9.17185669074838e-06,
"loss": 0.0146,
"num_input_tokens_seen": 906233856,
"step": 885000
},
{
"epoch": 8.170251243299102,
"grad_norm": 0.6293473243713379,
"learning_rate": 9.148789917051882e-06,
"loss": 0.0142,
"num_input_tokens_seen": 906745856,
"step": 885500
},
{
"epoch": 8.174864598038402,
"grad_norm": 1.834021806716919,
"learning_rate": 9.125723143355386e-06,
"loss": 0.0148,
"num_input_tokens_seen": 907257856,
"step": 886000
},
{
"epoch": 8.1794779527777,
"grad_norm": 0.4335891008377075,
"learning_rate": 9.10265636965889e-06,
"loss": 0.0161,
"num_input_tokens_seen": 907769856,
"step": 886500
},
{
"epoch": 8.184091307517,
"grad_norm": 0.573677659034729,
"learning_rate": 9.079589595962392e-06,
"loss": 0.0142,
"num_input_tokens_seen": 908281856,
"step": 887000
},
{
"epoch": 8.1887046622563,
"grad_norm": 0.7976333498954773,
"learning_rate": 9.056522822265896e-06,
"loss": 0.016,
"num_input_tokens_seen": 908793856,
"step": 887500
},
{
"epoch": 8.1933180169956,
"grad_norm": 1.0269770622253418,
"learning_rate": 9.0334560485694e-06,
"loss": 0.0155,
"num_input_tokens_seen": 909305856,
"step": 888000
},
{
"epoch": 8.197931371734898,
"grad_norm": 0.9196085333824158,
"learning_rate": 9.010389274872902e-06,
"loss": 0.0161,
"num_input_tokens_seen": 909817856,
"step": 888500
},
{
"epoch": 8.202544726474198,
"grad_norm": 0.9371418952941895,
"learning_rate": 8.987322501176406e-06,
"loss": 0.0169,
"num_input_tokens_seen": 910329856,
"step": 889000
},
{
"epoch": 8.207158081213496,
"grad_norm": 0.5787968635559082,
"learning_rate": 8.964255727479908e-06,
"loss": 0.0151,
"num_input_tokens_seen": 910841856,
"step": 889500
},
{
"epoch": 8.211771435952796,
"grad_norm": 0.44304850697517395,
"learning_rate": 8.941188953783412e-06,
"loss": 0.0151,
"num_input_tokens_seen": 911353856,
"step": 890000
},
{
"epoch": 8.216384790692096,
"grad_norm": 1.7044280767440796,
"learning_rate": 8.918122180086916e-06,
"loss": 0.0165,
"num_input_tokens_seen": 911865856,
"step": 890500
},
{
"epoch": 8.220998145431395,
"grad_norm": 0.6133010983467102,
"learning_rate": 8.895055406390419e-06,
"loss": 0.0157,
"num_input_tokens_seen": 912377856,
"step": 891000
},
{
"epoch": 8.225611500170695,
"grad_norm": 2.290767192840576,
"learning_rate": 8.871988632693923e-06,
"loss": 0.016,
"num_input_tokens_seen": 912889856,
"step": 891500
},
{
"epoch": 8.230224854909993,
"grad_norm": 0.47266674041748047,
"learning_rate": 8.848921858997427e-06,
"loss": 0.0152,
"num_input_tokens_seen": 913401856,
"step": 892000
},
{
"epoch": 8.234838209649293,
"grad_norm": 0.7107419967651367,
"learning_rate": 8.82585508530093e-06,
"loss": 0.0148,
"num_input_tokens_seen": 913913856,
"step": 892500
},
{
"epoch": 8.239451564388592,
"grad_norm": 0.29794007539749146,
"learning_rate": 8.802788311604433e-06,
"loss": 0.0158,
"num_input_tokens_seen": 914425856,
"step": 893000
},
{
"epoch": 8.244064919127892,
"grad_norm": 0.9938859939575195,
"learning_rate": 8.779721537907937e-06,
"loss": 0.0138,
"num_input_tokens_seen": 914937856,
"step": 893500
},
{
"epoch": 8.24867827386719,
"grad_norm": 0.9996763467788696,
"learning_rate": 8.75665476421144e-06,
"loss": 0.0163,
"num_input_tokens_seen": 915449856,
"step": 894000
},
{
"epoch": 8.25329162860649,
"grad_norm": 0.8853555917739868,
"learning_rate": 8.733587990514943e-06,
"loss": 0.016,
"num_input_tokens_seen": 915961856,
"step": 894500
},
{
"epoch": 8.257904983345789,
"grad_norm": 0.5720754861831665,
"learning_rate": 8.710521216818445e-06,
"loss": 0.0147,
"num_input_tokens_seen": 916473856,
"step": 895000
},
{
"epoch": 8.262518338085089,
"grad_norm": 0.7386252880096436,
"learning_rate": 8.68745444312195e-06,
"loss": 0.0156,
"num_input_tokens_seen": 916985856,
"step": 895500
},
{
"epoch": 8.267131692824387,
"grad_norm": 0.5073798298835754,
"learning_rate": 8.664387669425453e-06,
"loss": 0.0152,
"num_input_tokens_seen": 917497856,
"step": 896000
},
{
"epoch": 8.271745047563687,
"grad_norm": 2.3658652305603027,
"learning_rate": 8.641320895728956e-06,
"loss": 0.0173,
"num_input_tokens_seen": 918009856,
"step": 896500
},
{
"epoch": 8.276358402302987,
"grad_norm": 0.4761596620082855,
"learning_rate": 8.61825412203246e-06,
"loss": 0.0145,
"num_input_tokens_seen": 918521856,
"step": 897000
},
{
"epoch": 8.280971757042286,
"grad_norm": 0.5883774161338806,
"learning_rate": 8.595187348335963e-06,
"loss": 0.0153,
"num_input_tokens_seen": 919033856,
"step": 897500
},
{
"epoch": 8.285585111781586,
"grad_norm": 0.9515103101730347,
"learning_rate": 8.572120574639467e-06,
"loss": 0.015,
"num_input_tokens_seen": 919545856,
"step": 898000
},
{
"epoch": 8.290198466520884,
"grad_norm": 0.5109001398086548,
"learning_rate": 8.54905380094297e-06,
"loss": 0.0164,
"num_input_tokens_seen": 920057856,
"step": 898500
},
{
"epoch": 8.294811821260184,
"grad_norm": 0.8202781081199646,
"learning_rate": 8.525987027246474e-06,
"loss": 0.0151,
"num_input_tokens_seen": 920569856,
"step": 899000
},
{
"epoch": 8.299425175999483,
"grad_norm": 1.913580060005188,
"learning_rate": 8.502920253549978e-06,
"loss": 0.0155,
"num_input_tokens_seen": 921081856,
"step": 899500
},
{
"epoch": 8.304038530738783,
"grad_norm": 0.6409407258033752,
"learning_rate": 8.47985347985348e-06,
"loss": 0.013,
"num_input_tokens_seen": 921593856,
"step": 900000
},
{
"epoch": 8.308651885478081,
"grad_norm": 0.4128231108188629,
"learning_rate": 8.456786706156984e-06,
"loss": 0.0148,
"num_input_tokens_seen": 922105856,
"step": 900500
},
{
"epoch": 8.313265240217381,
"grad_norm": 2.3555517196655273,
"learning_rate": 8.433719932460488e-06,
"loss": 0.0142,
"num_input_tokens_seen": 922617856,
"step": 901000
},
{
"epoch": 8.31787859495668,
"grad_norm": 1.5205661058425903,
"learning_rate": 8.41065315876399e-06,
"loss": 0.016,
"num_input_tokens_seen": 923129856,
"step": 901500
},
{
"epoch": 8.32249194969598,
"grad_norm": 0.8352044224739075,
"learning_rate": 8.387586385067492e-06,
"loss": 0.0154,
"num_input_tokens_seen": 923641856,
"step": 902000
},
{
"epoch": 8.32710530443528,
"grad_norm": 0.256552129983902,
"learning_rate": 8.364519611370996e-06,
"loss": 0.0151,
"num_input_tokens_seen": 924153856,
"step": 902500
},
{
"epoch": 8.331718659174578,
"grad_norm": 0.9458514451980591,
"learning_rate": 8.3414528376745e-06,
"loss": 0.0162,
"num_input_tokens_seen": 924665856,
"step": 903000
},
{
"epoch": 8.336332013913879,
"grad_norm": 0.9356163740158081,
"learning_rate": 8.318386063978004e-06,
"loss": 0.0144,
"num_input_tokens_seen": 925177856,
"step": 903500
},
{
"epoch": 8.340945368653177,
"grad_norm": 0.6801881790161133,
"learning_rate": 8.295319290281507e-06,
"loss": 0.0143,
"num_input_tokens_seen": 925689856,
"step": 904000
},
{
"epoch": 8.345558723392477,
"grad_norm": 1.2119888067245483,
"learning_rate": 8.27225251658501e-06,
"loss": 0.0147,
"num_input_tokens_seen": 926201856,
"step": 904500
},
{
"epoch": 8.350172078131775,
"grad_norm": 0.6034347414970398,
"learning_rate": 8.249185742888515e-06,
"loss": 0.0147,
"num_input_tokens_seen": 926713856,
"step": 905000
},
{
"epoch": 8.354785432871076,
"grad_norm": 0.47974085807800293,
"learning_rate": 8.226118969192017e-06,
"loss": 0.0173,
"num_input_tokens_seen": 927225856,
"step": 905500
},
{
"epoch": 8.359398787610374,
"grad_norm": 0.7787156105041504,
"learning_rate": 8.20305219549552e-06,
"loss": 0.0161,
"num_input_tokens_seen": 927737856,
"step": 906000
},
{
"epoch": 8.364012142349674,
"grad_norm": 0.8252438306808472,
"learning_rate": 8.179985421799025e-06,
"loss": 0.0156,
"num_input_tokens_seen": 928249856,
"step": 906500
},
{
"epoch": 8.368625497088972,
"grad_norm": 1.7516320943832397,
"learning_rate": 8.156918648102529e-06,
"loss": 0.0142,
"num_input_tokens_seen": 928761856,
"step": 907000
},
{
"epoch": 8.373238851828273,
"grad_norm": 0.9089247584342957,
"learning_rate": 8.133851874406031e-06,
"loss": 0.0179,
"num_input_tokens_seen": 929273856,
"step": 907500
},
{
"epoch": 8.377852206567571,
"grad_norm": 0.5961917042732239,
"learning_rate": 8.110785100709533e-06,
"loss": 0.0147,
"num_input_tokens_seen": 929785856,
"step": 908000
},
{
"epoch": 8.382465561306871,
"grad_norm": 0.9045282602310181,
"learning_rate": 8.087718327013037e-06,
"loss": 0.015,
"num_input_tokens_seen": 930297856,
"step": 908500
},
{
"epoch": 8.387078916046171,
"grad_norm": 2.7716050148010254,
"learning_rate": 8.064651553316541e-06,
"loss": 0.0166,
"num_input_tokens_seen": 930809856,
"step": 909000
},
{
"epoch": 8.39169227078547,
"grad_norm": 0.8180987238883972,
"learning_rate": 8.041584779620044e-06,
"loss": 0.0142,
"num_input_tokens_seen": 931321856,
"step": 909500
},
{
"epoch": 8.39630562552477,
"grad_norm": 0.8871789574623108,
"learning_rate": 8.018518005923548e-06,
"loss": 0.014,
"num_input_tokens_seen": 931833856,
"step": 910000
},
{
"epoch": 8.400918980264068,
"grad_norm": 0.9161932468414307,
"learning_rate": 7.995451232227051e-06,
"loss": 0.0153,
"num_input_tokens_seen": 932345856,
"step": 910500
},
{
"epoch": 8.405532335003368,
"grad_norm": 0.6723649501800537,
"learning_rate": 7.972384458530555e-06,
"loss": 0.0167,
"num_input_tokens_seen": 932857856,
"step": 911000
},
{
"epoch": 8.410145689742667,
"grad_norm": 0.57211834192276,
"learning_rate": 7.949317684834058e-06,
"loss": 0.015,
"num_input_tokens_seen": 933369856,
"step": 911500
},
{
"epoch": 8.414759044481967,
"grad_norm": 0.7815681099891663,
"learning_rate": 7.926250911137562e-06,
"loss": 0.0155,
"num_input_tokens_seen": 933881856,
"step": 912000
},
{
"epoch": 8.419372399221265,
"grad_norm": 1.4835954904556274,
"learning_rate": 7.903184137441066e-06,
"loss": 0.0175,
"num_input_tokens_seen": 934393856,
"step": 912500
},
{
"epoch": 8.423985753960565,
"grad_norm": 0.6556302905082703,
"learning_rate": 7.880117363744568e-06,
"loss": 0.0157,
"num_input_tokens_seen": 934905856,
"step": 913000
},
{
"epoch": 8.428599108699864,
"grad_norm": 0.3592114746570587,
"learning_rate": 7.857050590048072e-06,
"loss": 0.0148,
"num_input_tokens_seen": 935417856,
"step": 913500
},
{
"epoch": 8.433212463439164,
"grad_norm": 1.0812350511550903,
"learning_rate": 7.833983816351576e-06,
"loss": 0.0152,
"num_input_tokens_seen": 935929856,
"step": 914000
},
{
"epoch": 8.437825818178464,
"grad_norm": 0.5357770919799805,
"learning_rate": 7.810917042655078e-06,
"loss": 0.0151,
"num_input_tokens_seen": 936441856,
"step": 914500
},
{
"epoch": 8.442439172917762,
"grad_norm": 1.2673269510269165,
"learning_rate": 7.78785026895858e-06,
"loss": 0.0145,
"num_input_tokens_seen": 936953856,
"step": 915000
},
{
"epoch": 8.447052527657062,
"grad_norm": 1.7254928350448608,
"learning_rate": 7.764783495262084e-06,
"loss": 0.0165,
"num_input_tokens_seen": 937465856,
"step": 915500
},
{
"epoch": 8.45166588239636,
"grad_norm": 0.740627646446228,
"learning_rate": 7.741716721565588e-06,
"loss": 0.0145,
"num_input_tokens_seen": 937977856,
"step": 916000
},
{
"epoch": 8.45627923713566,
"grad_norm": 0.8942471146583557,
"learning_rate": 7.718649947869092e-06,
"loss": 0.0164,
"num_input_tokens_seen": 938489856,
"step": 916500
},
{
"epoch": 8.46089259187496,
"grad_norm": 0.5979003310203552,
"learning_rate": 7.695583174172595e-06,
"loss": 0.0152,
"num_input_tokens_seen": 939001856,
"step": 917000
},
{
"epoch": 8.46550594661426,
"grad_norm": 0.690619945526123,
"learning_rate": 7.672516400476099e-06,
"loss": 0.014,
"num_input_tokens_seen": 939513856,
"step": 917500
},
{
"epoch": 8.470119301353558,
"grad_norm": 0.9563241004943848,
"learning_rate": 7.649449626779603e-06,
"loss": 0.0151,
"num_input_tokens_seen": 940025856,
"step": 918000
},
{
"epoch": 8.474732656092858,
"grad_norm": 0.7812721729278564,
"learning_rate": 7.626382853083106e-06,
"loss": 0.0169,
"num_input_tokens_seen": 940537856,
"step": 918500
},
{
"epoch": 8.479346010832156,
"grad_norm": 0.7864488959312439,
"learning_rate": 7.603316079386609e-06,
"loss": 0.0165,
"num_input_tokens_seen": 941049856,
"step": 919000
},
{
"epoch": 8.483959365571456,
"grad_norm": 0.41324466466903687,
"learning_rate": 7.580249305690113e-06,
"loss": 0.0148,
"num_input_tokens_seen": 941561856,
"step": 919500
},
{
"epoch": 8.488572720310756,
"grad_norm": 1.0213603973388672,
"learning_rate": 7.557182531993616e-06,
"loss": 0.0141,
"num_input_tokens_seen": 942073856,
"step": 920000
},
{
"epoch": 8.493186075050055,
"grad_norm": 0.9692112803459167,
"learning_rate": 7.53411575829712e-06,
"loss": 0.0162,
"num_input_tokens_seen": 942585856,
"step": 920500
},
{
"epoch": 8.497799429789355,
"grad_norm": 0.9468556642532349,
"learning_rate": 7.511048984600621e-06,
"loss": 0.015,
"num_input_tokens_seen": 943097856,
"step": 921000
},
{
"epoch": 8.502412784528653,
"grad_norm": 1.1541293859481812,
"learning_rate": 7.487982210904125e-06,
"loss": 0.0154,
"num_input_tokens_seen": 943609856,
"step": 921500
},
{
"epoch": 8.507026139267953,
"grad_norm": 0.6092996597290039,
"learning_rate": 7.464915437207628e-06,
"loss": 0.0145,
"num_input_tokens_seen": 944121856,
"step": 922000
},
{
"epoch": 8.511639494007252,
"grad_norm": 2.1357691287994385,
"learning_rate": 7.441848663511132e-06,
"loss": 0.0158,
"num_input_tokens_seen": 944633856,
"step": 922500
},
{
"epoch": 8.516252848746552,
"grad_norm": 0.8940873146057129,
"learning_rate": 7.4187818898146355e-06,
"loss": 0.0158,
"num_input_tokens_seen": 945145856,
"step": 923000
},
{
"epoch": 8.52086620348585,
"grad_norm": 0.44890737533569336,
"learning_rate": 7.395715116118139e-06,
"loss": 0.0183,
"num_input_tokens_seen": 945657856,
"step": 923500
},
{
"epoch": 8.52547955822515,
"grad_norm": 0.6357942223548889,
"learning_rate": 7.372648342421643e-06,
"loss": 0.0158,
"num_input_tokens_seen": 946169856,
"step": 924000
},
{
"epoch": 8.530092912964449,
"grad_norm": 1.20125150680542,
"learning_rate": 7.349581568725146e-06,
"loss": 0.0138,
"num_input_tokens_seen": 946681856,
"step": 924500
},
{
"epoch": 8.534706267703749,
"grad_norm": 1.3115291595458984,
"learning_rate": 7.32651479502865e-06,
"loss": 0.0156,
"num_input_tokens_seen": 947193856,
"step": 925000
},
{
"epoch": 8.539319622443049,
"grad_norm": 1.5604932308197021,
"learning_rate": 7.303448021332153e-06,
"loss": 0.0152,
"num_input_tokens_seen": 947705856,
"step": 925500
},
{
"epoch": 8.543932977182347,
"grad_norm": 0.5092642307281494,
"learning_rate": 7.280381247635657e-06,
"loss": 0.0159,
"num_input_tokens_seen": 948217856,
"step": 926000
},
{
"epoch": 8.548546331921647,
"grad_norm": 0.914828896522522,
"learning_rate": 7.25731447393916e-06,
"loss": 0.0138,
"num_input_tokens_seen": 948729856,
"step": 926500
},
{
"epoch": 8.553159686660946,
"grad_norm": 0.554459810256958,
"learning_rate": 7.234247700242663e-06,
"loss": 0.0155,
"num_input_tokens_seen": 949241856,
"step": 927000
},
{
"epoch": 8.557773041400246,
"grad_norm": 0.48894843459129333,
"learning_rate": 7.211180926546165e-06,
"loss": 0.0142,
"num_input_tokens_seen": 949753856,
"step": 927500
},
{
"epoch": 8.562386396139544,
"grad_norm": 1.2641159296035767,
"learning_rate": 7.188114152849669e-06,
"loss": 0.0163,
"num_input_tokens_seen": 950265856,
"step": 928000
},
{
"epoch": 8.566999750878844,
"grad_norm": 0.9658982157707214,
"learning_rate": 7.165047379153172e-06,
"loss": 0.0138,
"num_input_tokens_seen": 950777856,
"step": 928500
},
{
"epoch": 8.571613105618143,
"grad_norm": 1.2537494897842407,
"learning_rate": 7.141980605456676e-06,
"loss": 0.016,
"num_input_tokens_seen": 951289856,
"step": 929000
},
{
"epoch": 8.576226460357443,
"grad_norm": 2.147233009338379,
"learning_rate": 7.1189138317601795e-06,
"loss": 0.0155,
"num_input_tokens_seen": 951801856,
"step": 929500
},
{
"epoch": 8.580839815096741,
"grad_norm": 1.6873968839645386,
"learning_rate": 7.095847058063683e-06,
"loss": 0.0155,
"num_input_tokens_seen": 952313856,
"step": 930000
},
{
"epoch": 8.585453169836041,
"grad_norm": 1.5905687808990479,
"learning_rate": 7.072780284367187e-06,
"loss": 0.016,
"num_input_tokens_seen": 952825856,
"step": 930500
},
{
"epoch": 8.590066524575342,
"grad_norm": 0.8234834671020508,
"learning_rate": 7.04971351067069e-06,
"loss": 0.0142,
"num_input_tokens_seen": 953337856,
"step": 931000
},
{
"epoch": 8.59467987931464,
"grad_norm": 1.0002344846725464,
"learning_rate": 7.026646736974194e-06,
"loss": 0.0152,
"num_input_tokens_seen": 953849856,
"step": 931500
},
{
"epoch": 8.59929323405394,
"grad_norm": 4.079251289367676,
"learning_rate": 7.003579963277697e-06,
"loss": 0.0146,
"num_input_tokens_seen": 954361856,
"step": 932000
},
{
"epoch": 8.603906588793238,
"grad_norm": 0.8030288815498352,
"learning_rate": 6.980513189581201e-06,
"loss": 0.0145,
"num_input_tokens_seen": 954873856,
"step": 932500
},
{
"epoch": 8.608519943532539,
"grad_norm": 0.8186569213867188,
"learning_rate": 6.957446415884704e-06,
"loss": 0.0161,
"num_input_tokens_seen": 955385856,
"step": 933000
},
{
"epoch": 8.613133298271837,
"grad_norm": 0.680074155330658,
"learning_rate": 6.934379642188207e-06,
"loss": 0.0155,
"num_input_tokens_seen": 955897856,
"step": 933500
},
{
"epoch": 8.617746653011137,
"grad_norm": 1.1147595643997192,
"learning_rate": 6.911312868491709e-06,
"loss": 0.0171,
"num_input_tokens_seen": 956409856,
"step": 934000
},
{
"epoch": 8.622360007750435,
"grad_norm": 1.0557124614715576,
"learning_rate": 6.888246094795213e-06,
"loss": 0.0155,
"num_input_tokens_seen": 956921856,
"step": 934500
},
{
"epoch": 8.626973362489736,
"grad_norm": 0.5240976214408875,
"learning_rate": 6.865179321098716e-06,
"loss": 0.0144,
"num_input_tokens_seen": 957433856,
"step": 935000
},
{
"epoch": 8.631586717229034,
"grad_norm": 0.6534589529037476,
"learning_rate": 6.8421125474022195e-06,
"loss": 0.0167,
"num_input_tokens_seen": 957945856,
"step": 935500
},
{
"epoch": 8.636200071968334,
"grad_norm": 0.33386147022247314,
"learning_rate": 6.8190457737057235e-06,
"loss": 0.0128,
"num_input_tokens_seen": 958457856,
"step": 936000
},
{
"epoch": 8.640813426707634,
"grad_norm": 1.6744736433029175,
"learning_rate": 6.795979000009227e-06,
"loss": 0.0159,
"num_input_tokens_seen": 958969856,
"step": 936500
},
{
"epoch": 8.645426781446933,
"grad_norm": 6.504983425140381,
"learning_rate": 6.7729122263127306e-06,
"loss": 0.0182,
"num_input_tokens_seen": 959481856,
"step": 937000
},
{
"epoch": 8.650040136186233,
"grad_norm": 1.2921936511993408,
"learning_rate": 6.749845452616234e-06,
"loss": 0.0164,
"num_input_tokens_seen": 959993856,
"step": 937500
},
{
"epoch": 8.654653490925531,
"grad_norm": 1.5937762260437012,
"learning_rate": 6.726778678919738e-06,
"loss": 0.0156,
"num_input_tokens_seen": 960505856,
"step": 938000
},
{
"epoch": 8.659266845664831,
"grad_norm": 0.9005319476127625,
"learning_rate": 6.703711905223241e-06,
"loss": 0.0165,
"num_input_tokens_seen": 961017856,
"step": 938500
},
{
"epoch": 8.66388020040413,
"grad_norm": 1.019418716430664,
"learning_rate": 6.680645131526744e-06,
"loss": 0.0162,
"num_input_tokens_seen": 961529856,
"step": 939000
},
{
"epoch": 8.66849355514343,
"grad_norm": 0.5105811953544617,
"learning_rate": 6.657578357830248e-06,
"loss": 0.0152,
"num_input_tokens_seen": 962041856,
"step": 939500
},
{
"epoch": 8.673106909882728,
"grad_norm": 0.6588147282600403,
"learning_rate": 6.634511584133751e-06,
"loss": 0.0173,
"num_input_tokens_seen": 962553856,
"step": 940000
},
{
"epoch": 8.677720264622028,
"grad_norm": 0.5775207877159119,
"learning_rate": 6.611444810437253e-06,
"loss": 0.0158,
"num_input_tokens_seen": 963065856,
"step": 940500
},
{
"epoch": 8.682333619361327,
"grad_norm": 1.1807801723480225,
"learning_rate": 6.588378036740757e-06,
"loss": 0.015,
"num_input_tokens_seen": 963577856,
"step": 941000
},
{
"epoch": 8.686946974100627,
"grad_norm": 0.7394533157348633,
"learning_rate": 6.56531126304426e-06,
"loss": 0.0149,
"num_input_tokens_seen": 964089856,
"step": 941500
},
{
"epoch": 8.691560328839925,
"grad_norm": 0.5393823385238647,
"learning_rate": 6.5422444893477635e-06,
"loss": 0.0159,
"num_input_tokens_seen": 964601856,
"step": 942000
},
{
"epoch": 8.696173683579225,
"grad_norm": 1.1270785331726074,
"learning_rate": 6.5191777156512675e-06,
"loss": 0.0145,
"num_input_tokens_seen": 965113856,
"step": 942500
},
{
"epoch": 8.700787038318525,
"grad_norm": 1.156285047531128,
"learning_rate": 6.496110941954771e-06,
"loss": 0.0147,
"num_input_tokens_seen": 965625856,
"step": 943000
},
{
"epoch": 8.705400393057824,
"grad_norm": 0.3501507639884949,
"learning_rate": 6.4730441682582746e-06,
"loss": 0.0167,
"num_input_tokens_seen": 966137856,
"step": 943500
},
{
"epoch": 8.710013747797124,
"grad_norm": 0.7830114960670471,
"learning_rate": 6.449977394561778e-06,
"loss": 0.0155,
"num_input_tokens_seen": 966649856,
"step": 944000
},
{
"epoch": 8.714627102536422,
"grad_norm": 0.9424002766609192,
"learning_rate": 6.426910620865282e-06,
"loss": 0.0159,
"num_input_tokens_seen": 967161856,
"step": 944500
},
{
"epoch": 8.719240457275722,
"grad_norm": 1.7092015743255615,
"learning_rate": 6.403843847168785e-06,
"loss": 0.0159,
"num_input_tokens_seen": 967673856,
"step": 945000
},
{
"epoch": 8.72385381201502,
"grad_norm": 0.3808750808238983,
"learning_rate": 6.380777073472288e-06,
"loss": 0.0157,
"num_input_tokens_seen": 968185856,
"step": 945500
},
{
"epoch": 8.72846716675432,
"grad_norm": 0.8436591625213623,
"learning_rate": 6.357710299775792e-06,
"loss": 0.015,
"num_input_tokens_seen": 968697856,
"step": 946000
},
{
"epoch": 8.73308052149362,
"grad_norm": 0.48995792865753174,
"learning_rate": 6.334643526079295e-06,
"loss": 0.014,
"num_input_tokens_seen": 969209856,
"step": 946500
},
{
"epoch": 8.73769387623292,
"grad_norm": 0.6074419021606445,
"learning_rate": 6.311576752382799e-06,
"loss": 0.0163,
"num_input_tokens_seen": 969721856,
"step": 947000
},
{
"epoch": 8.742307230972218,
"grad_norm": 1.1008994579315186,
"learning_rate": 6.2885099786863e-06,
"loss": 0.014,
"num_input_tokens_seen": 970233856,
"step": 947500
},
{
"epoch": 8.746920585711518,
"grad_norm": 0.4239863157272339,
"learning_rate": 6.265443204989804e-06,
"loss": 0.0152,
"num_input_tokens_seen": 970745856,
"step": 948000
},
{
"epoch": 8.751533940450816,
"grad_norm": 0.8348074555397034,
"learning_rate": 6.242376431293308e-06,
"loss": 0.0159,
"num_input_tokens_seen": 971257856,
"step": 948500
},
{
"epoch": 8.756147295190116,
"grad_norm": 0.9429554343223572,
"learning_rate": 6.2193096575968115e-06,
"loss": 0.016,
"num_input_tokens_seen": 971769856,
"step": 949000
},
{
"epoch": 8.760760649929416,
"grad_norm": 0.8379220366477966,
"learning_rate": 6.196242883900315e-06,
"loss": 0.0144,
"num_input_tokens_seen": 972281856,
"step": 949500
},
{
"epoch": 8.765374004668715,
"grad_norm": 0.543300211429596,
"learning_rate": 6.1731761102038186e-06,
"loss": 0.0168,
"num_input_tokens_seen": 972793856,
"step": 950000
},
{
"epoch": 8.769987359408015,
"grad_norm": 1.0430985689163208,
"learning_rate": 6.150109336507322e-06,
"loss": 0.0148,
"num_input_tokens_seen": 973305856,
"step": 950500
},
{
"epoch": 8.774600714147313,
"grad_norm": 1.5497344732284546,
"learning_rate": 6.127042562810825e-06,
"loss": 0.0154,
"num_input_tokens_seen": 973817856,
"step": 951000
},
{
"epoch": 8.779214068886613,
"grad_norm": 0.5469529628753662,
"learning_rate": 6.103975789114329e-06,
"loss": 0.0136,
"num_input_tokens_seen": 974329856,
"step": 951500
},
{
"epoch": 8.783827423625912,
"grad_norm": 1.1605631113052368,
"learning_rate": 6.080909015417832e-06,
"loss": 0.0143,
"num_input_tokens_seen": 974841856,
"step": 952000
},
{
"epoch": 8.788440778365212,
"grad_norm": 0.4232845604419708,
"learning_rate": 6.057842241721335e-06,
"loss": 0.015,
"num_input_tokens_seen": 975353856,
"step": 952500
},
{
"epoch": 8.79305413310451,
"grad_norm": 0.9222050905227661,
"learning_rate": 6.034775468024838e-06,
"loss": 0.0142,
"num_input_tokens_seen": 975865856,
"step": 953000
},
{
"epoch": 8.79766748784381,
"grad_norm": 0.6866771578788757,
"learning_rate": 6.011708694328342e-06,
"loss": 0.0149,
"num_input_tokens_seen": 976377856,
"step": 953500
},
{
"epoch": 8.802280842583109,
"grad_norm": 0.7165865302085876,
"learning_rate": 5.988641920631845e-06,
"loss": 0.0153,
"num_input_tokens_seen": 976889856,
"step": 954000
},
{
"epoch": 8.806894197322409,
"grad_norm": 0.8396665453910828,
"learning_rate": 5.965575146935349e-06,
"loss": 0.0141,
"num_input_tokens_seen": 977401856,
"step": 954500
},
{
"epoch": 8.811507552061709,
"grad_norm": 0.6975528597831726,
"learning_rate": 5.942508373238852e-06,
"loss": 0.0147,
"num_input_tokens_seen": 977913856,
"step": 955000
},
{
"epoch": 8.816120906801007,
"grad_norm": 0.8357110619544983,
"learning_rate": 5.9194415995423555e-06,
"loss": 0.0148,
"num_input_tokens_seen": 978425856,
"step": 955500
},
{
"epoch": 8.820734261540307,
"grad_norm": 0.9856480956077576,
"learning_rate": 5.896374825845859e-06,
"loss": 0.0155,
"num_input_tokens_seen": 978937856,
"step": 956000
},
{
"epoch": 8.825347616279606,
"grad_norm": 1.2731949090957642,
"learning_rate": 5.8733080521493626e-06,
"loss": 0.0155,
"num_input_tokens_seen": 979449856,
"step": 956500
},
{
"epoch": 8.829960971018906,
"grad_norm": 0.7930001020431519,
"learning_rate": 5.850241278452866e-06,
"loss": 0.0143,
"num_input_tokens_seen": 979961856,
"step": 957000
},
{
"epoch": 8.834574325758204,
"grad_norm": 0.7619320154190063,
"learning_rate": 5.827174504756369e-06,
"loss": 0.016,
"num_input_tokens_seen": 980473856,
"step": 957500
},
{
"epoch": 8.839187680497504,
"grad_norm": 0.7133992314338684,
"learning_rate": 5.804107731059873e-06,
"loss": 0.0164,
"num_input_tokens_seen": 980985856,
"step": 958000
},
{
"epoch": 8.843801035236803,
"grad_norm": 0.42310747504234314,
"learning_rate": 5.781040957363375e-06,
"loss": 0.0146,
"num_input_tokens_seen": 981497856,
"step": 958500
},
{
"epoch": 8.848414389976103,
"grad_norm": 0.3348715305328369,
"learning_rate": 5.757974183666879e-06,
"loss": 0.0162,
"num_input_tokens_seen": 982009856,
"step": 959000
},
{
"epoch": 8.853027744715401,
"grad_norm": 0.6126227974891663,
"learning_rate": 5.734907409970382e-06,
"loss": 0.0141,
"num_input_tokens_seen": 982521856,
"step": 959500
},
{
"epoch": 8.857641099454701,
"grad_norm": 0.6455732583999634,
"learning_rate": 5.711840636273886e-06,
"loss": 0.0154,
"num_input_tokens_seen": 983033856,
"step": 960000
},
{
"epoch": 8.862254454194002,
"grad_norm": 1.075323224067688,
"learning_rate": 5.688773862577389e-06,
"loss": 0.016,
"num_input_tokens_seen": 983545856,
"step": 960500
},
{
"epoch": 8.8668678089333,
"grad_norm": 0.8069124817848206,
"learning_rate": 5.665707088880893e-06,
"loss": 0.0149,
"num_input_tokens_seen": 984057856,
"step": 961000
},
{
"epoch": 8.8714811636726,
"grad_norm": 0.9779102206230164,
"learning_rate": 5.642640315184396e-06,
"loss": 0.0154,
"num_input_tokens_seen": 984569856,
"step": 961500
},
{
"epoch": 8.876094518411898,
"grad_norm": 0.8441368937492371,
"learning_rate": 5.6195735414878994e-06,
"loss": 0.0154,
"num_input_tokens_seen": 985081856,
"step": 962000
},
{
"epoch": 8.880707873151199,
"grad_norm": 0.44055867195129395,
"learning_rate": 5.5965067677914026e-06,
"loss": 0.014,
"num_input_tokens_seen": 985593856,
"step": 962500
},
{
"epoch": 8.885321227890497,
"grad_norm": 1.1985424757003784,
"learning_rate": 5.5734399940949065e-06,
"loss": 0.0156,
"num_input_tokens_seen": 986105856,
"step": 963000
},
{
"epoch": 8.889934582629797,
"grad_norm": 1.8032441139221191,
"learning_rate": 5.55037322039841e-06,
"loss": 0.017,
"num_input_tokens_seen": 986617856,
"step": 963500
},
{
"epoch": 8.894547937369095,
"grad_norm": 2.679948329925537,
"learning_rate": 5.527306446701913e-06,
"loss": 0.016,
"num_input_tokens_seen": 987129856,
"step": 964000
},
{
"epoch": 8.899161292108396,
"grad_norm": 1.422170639038086,
"learning_rate": 5.504239673005417e-06,
"loss": 0.0152,
"num_input_tokens_seen": 987641856,
"step": 964500
},
{
"epoch": 8.903774646847694,
"grad_norm": 0.785531759262085,
"learning_rate": 5.481172899308919e-06,
"loss": 0.0157,
"num_input_tokens_seen": 988153856,
"step": 965000
},
{
"epoch": 8.908388001586994,
"grad_norm": 0.813910961151123,
"learning_rate": 5.458106125612423e-06,
"loss": 0.0167,
"num_input_tokens_seen": 988665856,
"step": 965500
},
{
"epoch": 8.913001356326294,
"grad_norm": 0.6769202351570129,
"learning_rate": 5.435039351915926e-06,
"loss": 0.0159,
"num_input_tokens_seen": 989177856,
"step": 966000
},
{
"epoch": 8.917614711065593,
"grad_norm": 2.5310189723968506,
"learning_rate": 5.41197257821943e-06,
"loss": 0.0152,
"num_input_tokens_seen": 989689856,
"step": 966500
},
{
"epoch": 8.922228065804893,
"grad_norm": 0.5400819182395935,
"learning_rate": 5.388905804522933e-06,
"loss": 0.0152,
"num_input_tokens_seen": 990201856,
"step": 967000
},
{
"epoch": 8.926841420544191,
"grad_norm": 0.33608752489089966,
"learning_rate": 5.365839030826437e-06,
"loss": 0.0153,
"num_input_tokens_seen": 990713856,
"step": 967500
},
{
"epoch": 8.931454775283491,
"grad_norm": 0.6144788265228271,
"learning_rate": 5.34277225712994e-06,
"loss": 0.0151,
"num_input_tokens_seen": 991225856,
"step": 968000
},
{
"epoch": 8.93606813002279,
"grad_norm": 0.8687652349472046,
"learning_rate": 5.3197054834334434e-06,
"loss": 0.016,
"num_input_tokens_seen": 991737856,
"step": 968500
},
{
"epoch": 8.94068148476209,
"grad_norm": 0.9648618698120117,
"learning_rate": 5.2966387097369466e-06,
"loss": 0.0166,
"num_input_tokens_seen": 992249856,
"step": 969000
},
{
"epoch": 8.945294839501388,
"grad_norm": 0.6023857593536377,
"learning_rate": 5.27357193604045e-06,
"loss": 0.0144,
"num_input_tokens_seen": 992761856,
"step": 969500
},
{
"epoch": 8.949908194240688,
"grad_norm": 1.8448054790496826,
"learning_rate": 5.250505162343954e-06,
"loss": 0.0155,
"num_input_tokens_seen": 993273856,
"step": 970000
},
{
"epoch": 8.954521548979987,
"grad_norm": 0.6951389312744141,
"learning_rate": 5.227438388647457e-06,
"loss": 0.0154,
"num_input_tokens_seen": 993785856,
"step": 970500
},
{
"epoch": 8.959134903719287,
"grad_norm": 0.5784729719161987,
"learning_rate": 5.204371614950961e-06,
"loss": 0.0154,
"num_input_tokens_seen": 994297856,
"step": 971000
},
{
"epoch": 8.963748258458587,
"grad_norm": 1.4732640981674194,
"learning_rate": 5.181304841254463e-06,
"loss": 0.0147,
"num_input_tokens_seen": 994809856,
"step": 971500
},
{
"epoch": 8.968361613197885,
"grad_norm": 0.9267556667327881,
"learning_rate": 5.158238067557967e-06,
"loss": 0.0149,
"num_input_tokens_seen": 995321856,
"step": 972000
},
{
"epoch": 8.972974967937185,
"grad_norm": 0.3285810053348541,
"learning_rate": 5.13517129386147e-06,
"loss": 0.0146,
"num_input_tokens_seen": 995833856,
"step": 972500
},
{
"epoch": 8.977588322676484,
"grad_norm": 1.0577844381332397,
"learning_rate": 5.112104520164974e-06,
"loss": 0.0142,
"num_input_tokens_seen": 996345856,
"step": 973000
},
{
"epoch": 8.982201677415784,
"grad_norm": 0.40497535467147827,
"learning_rate": 5.089037746468477e-06,
"loss": 0.0157,
"num_input_tokens_seen": 996857856,
"step": 973500
},
{
"epoch": 8.986815032155082,
"grad_norm": 0.6067364811897278,
"learning_rate": 5.065970972771981e-06,
"loss": 0.0155,
"num_input_tokens_seen": 997369856,
"step": 974000
},
{
"epoch": 8.991428386894382,
"grad_norm": 0.5121076703071594,
"learning_rate": 5.042904199075484e-06,
"loss": 0.0145,
"num_input_tokens_seen": 997881856,
"step": 974500
},
{
"epoch": 8.99604174163368,
"grad_norm": 1.0173983573913574,
"learning_rate": 5.0198374253789874e-06,
"loss": 0.0154,
"num_input_tokens_seen": 998393856,
"step": 975000
},
{
"epoch": 9.0,
"eval_combined_score": 0.06468997752487994,
"eval_loss": 0.06468997895717621,
"eval_mse": 0.06468997609258367,
"eval_runtime": 45.8521,
"eval_samples_per_second": 2101.059,
"eval_steps_per_second": 262.649,
"num_input_tokens_seen": 998832384,
"step": 975429
},
{
"epoch": 9.00065509637298,
"grad_norm": 0.4236084818840027,
"learning_rate": 4.9967706516824906e-06,
"loss": 0.0159,
"num_input_tokens_seen": 998905088,
"step": 975500
},
{
"epoch": 9.00526845111228,
"grad_norm": 0.6183050870895386,
"learning_rate": 4.973703877985994e-06,
"loss": 0.0138,
"num_input_tokens_seen": 999417088,
"step": 976000
},
{
"epoch": 9.00988180585158,
"grad_norm": 3.3244409561157227,
"learning_rate": 4.950637104289498e-06,
"loss": 0.0136,
"num_input_tokens_seen": 999929088,
"step": 976500
},
{
"epoch": 9.014495160590878,
"grad_norm": 0.5056183934211731,
"learning_rate": 4.927570330593001e-06,
"loss": 0.0133,
"num_input_tokens_seen": 1000441088,
"step": 977000
},
{
"epoch": 9.019108515330178,
"grad_norm": 0.6775535941123962,
"learning_rate": 4.904503556896505e-06,
"loss": 0.0135,
"num_input_tokens_seen": 1000953088,
"step": 977500
},
{
"epoch": 9.023721870069478,
"grad_norm": 0.4014028012752533,
"learning_rate": 4.881436783200007e-06,
"loss": 0.013,
"num_input_tokens_seen": 1001465088,
"step": 978000
},
{
"epoch": 9.028335224808776,
"grad_norm": 0.6904358863830566,
"learning_rate": 4.858370009503511e-06,
"loss": 0.0128,
"num_input_tokens_seen": 1001977088,
"step": 978500
},
{
"epoch": 9.032948579548076,
"grad_norm": 1.717046856880188,
"learning_rate": 4.835303235807014e-06,
"loss": 0.0135,
"num_input_tokens_seen": 1002489088,
"step": 979000
},
{
"epoch": 9.037561934287375,
"grad_norm": 1.1280878782272339,
"learning_rate": 4.812236462110518e-06,
"loss": 0.0141,
"num_input_tokens_seen": 1003001088,
"step": 979500
},
{
"epoch": 9.042175289026675,
"grad_norm": 0.9828783869743347,
"learning_rate": 4.789169688414021e-06,
"loss": 0.0139,
"num_input_tokens_seen": 1003513088,
"step": 980000
},
{
"epoch": 9.046788643765973,
"grad_norm": 0.9039996266365051,
"learning_rate": 4.766102914717524e-06,
"loss": 0.013,
"num_input_tokens_seen": 1004025088,
"step": 980500
},
{
"epoch": 9.051401998505273,
"grad_norm": 0.760273277759552,
"learning_rate": 4.743036141021028e-06,
"loss": 0.0129,
"num_input_tokens_seen": 1004537088,
"step": 981000
},
{
"epoch": 9.056015353244572,
"grad_norm": 0.6820119619369507,
"learning_rate": 4.719969367324531e-06,
"loss": 0.0138,
"num_input_tokens_seen": 1005049088,
"step": 981500
},
{
"epoch": 9.060628707983872,
"grad_norm": 0.8274890780448914,
"learning_rate": 4.6969025936280346e-06,
"loss": 0.0137,
"num_input_tokens_seen": 1005561088,
"step": 982000
},
{
"epoch": 9.06524206272317,
"grad_norm": 0.43844661116600037,
"learning_rate": 4.673835819931538e-06,
"loss": 0.0133,
"num_input_tokens_seen": 1006073088,
"step": 982500
},
{
"epoch": 9.06985541746247,
"grad_norm": 1.0397804975509644,
"learning_rate": 4.650769046235042e-06,
"loss": 0.0131,
"num_input_tokens_seen": 1006585088,
"step": 983000
},
{
"epoch": 9.07446877220177,
"grad_norm": 1.1185849905014038,
"learning_rate": 4.627702272538545e-06,
"loss": 0.0139,
"num_input_tokens_seen": 1007097088,
"step": 983500
},
{
"epoch": 9.079082126941069,
"grad_norm": 0.4616248607635498,
"learning_rate": 4.604635498842049e-06,
"loss": 0.0136,
"num_input_tokens_seen": 1007609088,
"step": 984000
},
{
"epoch": 9.083695481680369,
"grad_norm": 0.4887053966522217,
"learning_rate": 4.581568725145551e-06,
"loss": 0.0119,
"num_input_tokens_seen": 1008121088,
"step": 984500
},
{
"epoch": 9.088308836419667,
"grad_norm": 0.9657731056213379,
"learning_rate": 4.558501951449055e-06,
"loss": 0.0134,
"num_input_tokens_seen": 1008633088,
"step": 985000
},
{
"epoch": 9.092922191158967,
"grad_norm": 0.6589749455451965,
"learning_rate": 4.535435177752558e-06,
"loss": 0.0137,
"num_input_tokens_seen": 1009145088,
"step": 985500
},
{
"epoch": 9.097535545898266,
"grad_norm": 1.095737338066101,
"learning_rate": 4.512368404056062e-06,
"loss": 0.0136,
"num_input_tokens_seen": 1009657088,
"step": 986000
},
{
"epoch": 9.102148900637566,
"grad_norm": 0.9578360915184021,
"learning_rate": 4.489301630359565e-06,
"loss": 0.0144,
"num_input_tokens_seen": 1010169088,
"step": 986500
},
{
"epoch": 9.106762255376864,
"grad_norm": 1.0494704246520996,
"learning_rate": 4.466234856663068e-06,
"loss": 0.0143,
"num_input_tokens_seen": 1010681088,
"step": 987000
},
{
"epoch": 9.111375610116164,
"grad_norm": 0.3351483643054962,
"learning_rate": 4.443168082966572e-06,
"loss": 0.0143,
"num_input_tokens_seen": 1011193088,
"step": 987500
},
{
"epoch": 9.115988964855463,
"grad_norm": 1.107553482055664,
"learning_rate": 4.4201013092700746e-06,
"loss": 0.0158,
"num_input_tokens_seen": 1011705088,
"step": 988000
},
{
"epoch": 9.120602319594763,
"grad_norm": 0.8427937626838684,
"learning_rate": 4.3970345355735785e-06,
"loss": 0.0134,
"num_input_tokens_seen": 1012217088,
"step": 988500
},
{
"epoch": 9.125215674334063,
"grad_norm": 0.5374360084533691,
"learning_rate": 4.373967761877082e-06,
"loss": 0.0121,
"num_input_tokens_seen": 1012729088,
"step": 989000
},
{
"epoch": 9.129829029073361,
"grad_norm": 1.2801436185836792,
"learning_rate": 4.350900988180586e-06,
"loss": 0.0134,
"num_input_tokens_seen": 1013241088,
"step": 989500
},
{
"epoch": 9.134442383812662,
"grad_norm": 2.0048415660858154,
"learning_rate": 4.327834214484089e-06,
"loss": 0.0132,
"num_input_tokens_seen": 1013753088,
"step": 990000
},
{
"epoch": 9.13905573855196,
"grad_norm": 1.3461086750030518,
"learning_rate": 4.304767440787593e-06,
"loss": 0.0136,
"num_input_tokens_seen": 1014265088,
"step": 990500
},
{
"epoch": 9.14366909329126,
"grad_norm": 0.5770676732063293,
"learning_rate": 4.281700667091096e-06,
"loss": 0.0142,
"num_input_tokens_seen": 1014777088,
"step": 991000
},
{
"epoch": 9.148282448030558,
"grad_norm": 0.7648055553436279,
"learning_rate": 4.258633893394599e-06,
"loss": 0.0134,
"num_input_tokens_seen": 1015289088,
"step": 991500
},
{
"epoch": 9.152895802769859,
"grad_norm": 0.8219977021217346,
"learning_rate": 4.235567119698102e-06,
"loss": 0.0139,
"num_input_tokens_seen": 1015801088,
"step": 992000
},
{
"epoch": 9.157509157509157,
"grad_norm": 0.2618965804576874,
"learning_rate": 4.212500346001605e-06,
"loss": 0.0146,
"num_input_tokens_seen": 1016313088,
"step": 992500
},
{
"epoch": 9.162122512248457,
"grad_norm": 0.580898642539978,
"learning_rate": 4.189433572305109e-06,
"loss": 0.0131,
"num_input_tokens_seen": 1016825088,
"step": 993000
},
{
"epoch": 9.166735866987755,
"grad_norm": 1.426604151725769,
"learning_rate": 4.166366798608612e-06,
"loss": 0.0137,
"num_input_tokens_seen": 1017337088,
"step": 993500
},
{
"epoch": 9.171349221727056,
"grad_norm": 0.4607691764831543,
"learning_rate": 4.143300024912116e-06,
"loss": 0.0126,
"num_input_tokens_seen": 1017849088,
"step": 994000
},
{
"epoch": 9.175962576466354,
"grad_norm": 0.5528801083564758,
"learning_rate": 4.1202332512156186e-06,
"loss": 0.0125,
"num_input_tokens_seen": 1018361088,
"step": 994500
},
{
"epoch": 9.180575931205654,
"grad_norm": 0.24360989034175873,
"learning_rate": 4.0971664775191225e-06,
"loss": 0.0117,
"num_input_tokens_seen": 1018873088,
"step": 995000
},
{
"epoch": 9.185189285944954,
"grad_norm": 0.5846107602119446,
"learning_rate": 4.074099703822626e-06,
"loss": 0.0141,
"num_input_tokens_seen": 1019385088,
"step": 995500
},
{
"epoch": 9.189802640684253,
"grad_norm": 0.8627530932426453,
"learning_rate": 4.05103293012613e-06,
"loss": 0.0129,
"num_input_tokens_seen": 1019897088,
"step": 996000
},
{
"epoch": 9.194415995423553,
"grad_norm": 0.7435634732246399,
"learning_rate": 4.027966156429633e-06,
"loss": 0.0139,
"num_input_tokens_seen": 1020409088,
"step": 996500
},
{
"epoch": 9.199029350162851,
"grad_norm": 0.6394104957580566,
"learning_rate": 4.004899382733137e-06,
"loss": 0.0146,
"num_input_tokens_seen": 1020921088,
"step": 997000
},
{
"epoch": 9.203642704902151,
"grad_norm": 0.4735194444656372,
"learning_rate": 3.98183260903664e-06,
"loss": 0.0133,
"num_input_tokens_seen": 1021433088,
"step": 997500
},
{
"epoch": 9.20825605964145,
"grad_norm": 0.9603920578956604,
"learning_rate": 3.958765835340143e-06,
"loss": 0.013,
"num_input_tokens_seen": 1021945088,
"step": 998000
},
{
"epoch": 9.21286941438075,
"grad_norm": 1.0817182064056396,
"learning_rate": 3.935699061643646e-06,
"loss": 0.0117,
"num_input_tokens_seen": 1022457088,
"step": 998500
},
{
"epoch": 9.217482769120048,
"grad_norm": 0.5785081386566162,
"learning_rate": 3.912632287947149e-06,
"loss": 0.0127,
"num_input_tokens_seen": 1022969088,
"step": 999000
},
{
"epoch": 9.222096123859348,
"grad_norm": 0.34806227684020996,
"learning_rate": 3.889565514250653e-06,
"loss": 0.0129,
"num_input_tokens_seen": 1023481088,
"step": 999500
},
{
"epoch": 9.226709478598647,
"grad_norm": 0.8392277359962463,
"learning_rate": 3.866498740554156e-06,
"loss": 0.0128,
"num_input_tokens_seen": 1023993088,
"step": 1000000
},
{
"epoch": 9.231322833337947,
"grad_norm": 0.34862348437309265,
"learning_rate": 3.84343196685766e-06,
"loss": 0.0147,
"num_input_tokens_seen": 1024505088,
"step": 1000500
},
{
"epoch": 9.235936188077247,
"grad_norm": 0.8864858150482178,
"learning_rate": 3.8203651931611626e-06,
"loss": 0.013,
"num_input_tokens_seen": 1025017088,
"step": 1001000
},
{
"epoch": 9.240549542816545,
"grad_norm": 0.7740064263343811,
"learning_rate": 3.797298419464666e-06,
"loss": 0.0128,
"num_input_tokens_seen": 1025529088,
"step": 1001500
},
{
"epoch": 9.245162897555845,
"grad_norm": 0.21236860752105713,
"learning_rate": 3.7742316457681697e-06,
"loss": 0.013,
"num_input_tokens_seen": 1026041088,
"step": 1002000
},
{
"epoch": 9.249776252295144,
"grad_norm": 0.5248683094978333,
"learning_rate": 3.751164872071673e-06,
"loss": 0.0137,
"num_input_tokens_seen": 1026553088,
"step": 1002500
},
{
"epoch": 9.254389607034444,
"grad_norm": 0.49671700596809387,
"learning_rate": 3.7280980983751767e-06,
"loss": 0.0129,
"num_input_tokens_seen": 1027065088,
"step": 1003000
},
{
"epoch": 9.259002961773742,
"grad_norm": 0.7748130559921265,
"learning_rate": 3.7050313246786803e-06,
"loss": 0.013,
"num_input_tokens_seen": 1027577088,
"step": 1003500
},
{
"epoch": 9.263616316513042,
"grad_norm": 0.5696319341659546,
"learning_rate": 3.681964550982184e-06,
"loss": 0.0128,
"num_input_tokens_seen": 1028089088,
"step": 1004000
},
{
"epoch": 9.26822967125234,
"grad_norm": 1.47969651222229,
"learning_rate": 3.6588977772856865e-06,
"loss": 0.0134,
"num_input_tokens_seen": 1028601088,
"step": 1004500
},
{
"epoch": 9.27284302599164,
"grad_norm": 0.6833159923553467,
"learning_rate": 3.63583100358919e-06,
"loss": 0.0145,
"num_input_tokens_seen": 1029113088,
"step": 1005000
},
{
"epoch": 9.27745638073094,
"grad_norm": 0.9838703870773315,
"learning_rate": 3.6127642298926936e-06,
"loss": 0.0141,
"num_input_tokens_seen": 1029625088,
"step": 1005500
},
{
"epoch": 9.28206973547024,
"grad_norm": 0.5185501575469971,
"learning_rate": 3.589697456196197e-06,
"loss": 0.0144,
"num_input_tokens_seen": 1030137088,
"step": 1006000
},
{
"epoch": 9.28668309020954,
"grad_norm": 0.6044150590896606,
"learning_rate": 3.5666306824997003e-06,
"loss": 0.0132,
"num_input_tokens_seen": 1030649088,
"step": 1006500
},
{
"epoch": 9.291296444948838,
"grad_norm": 0.5589469075202942,
"learning_rate": 3.543563908803204e-06,
"loss": 0.0137,
"num_input_tokens_seen": 1031161088,
"step": 1007000
},
{
"epoch": 9.295909799688138,
"grad_norm": 0.8428828120231628,
"learning_rate": 3.5204971351067066e-06,
"loss": 0.0132,
"num_input_tokens_seen": 1031673088,
"step": 1007500
},
{
"epoch": 9.300523154427436,
"grad_norm": 1.0949701070785522,
"learning_rate": 3.49743036141021e-06,
"loss": 0.0132,
"num_input_tokens_seen": 1032185088,
"step": 1008000
},
{
"epoch": 9.305136509166736,
"grad_norm": 0.48161888122558594,
"learning_rate": 3.4743635877137136e-06,
"loss": 0.0138,
"num_input_tokens_seen": 1032697088,
"step": 1008500
},
{
"epoch": 9.309749863906035,
"grad_norm": 1.4229580163955688,
"learning_rate": 3.451296814017217e-06,
"loss": 0.013,
"num_input_tokens_seen": 1033209088,
"step": 1009000
},
{
"epoch": 9.314363218645335,
"grad_norm": 1.3797547817230225,
"learning_rate": 3.4282300403207207e-06,
"loss": 0.0143,
"num_input_tokens_seen": 1033721088,
"step": 1009500
},
{
"epoch": 9.318976573384633,
"grad_norm": 0.764750599861145,
"learning_rate": 3.4051632666242243e-06,
"loss": 0.0124,
"num_input_tokens_seen": 1034233088,
"step": 1010000
},
{
"epoch": 9.323589928123933,
"grad_norm": 1.4155054092407227,
"learning_rate": 3.382096492927728e-06,
"loss": 0.0143,
"num_input_tokens_seen": 1034745088,
"step": 1010500
},
{
"epoch": 9.328203282863232,
"grad_norm": 0.5639691352844238,
"learning_rate": 3.3590297192312305e-06,
"loss": 0.0123,
"num_input_tokens_seen": 1035257088,
"step": 1011000
},
{
"epoch": 9.332816637602532,
"grad_norm": 1.6954376697540283,
"learning_rate": 3.335962945534734e-06,
"loss": 0.0158,
"num_input_tokens_seen": 1035769088,
"step": 1011500
},
{
"epoch": 9.337429992341832,
"grad_norm": 1.096420168876648,
"learning_rate": 3.3128961718382376e-06,
"loss": 0.0134,
"num_input_tokens_seen": 1036281088,
"step": 1012000
},
{
"epoch": 9.34204334708113,
"grad_norm": 0.7063207626342773,
"learning_rate": 3.2898293981417408e-06,
"loss": 0.0142,
"num_input_tokens_seen": 1036793088,
"step": 1012500
},
{
"epoch": 9.34665670182043,
"grad_norm": 1.40740966796875,
"learning_rate": 3.2667626244452443e-06,
"loss": 0.0133,
"num_input_tokens_seen": 1037305088,
"step": 1013000
},
{
"epoch": 9.351270056559729,
"grad_norm": 1.0713701248168945,
"learning_rate": 3.243695850748748e-06,
"loss": 0.0122,
"num_input_tokens_seen": 1037817088,
"step": 1013500
},
{
"epoch": 9.355883411299029,
"grad_norm": 0.41992899775505066,
"learning_rate": 3.2206290770522505e-06,
"loss": 0.0114,
"num_input_tokens_seen": 1038329088,
"step": 1014000
},
{
"epoch": 9.360496766038327,
"grad_norm": 0.42630577087402344,
"learning_rate": 3.197562303355754e-06,
"loss": 0.0147,
"num_input_tokens_seen": 1038841088,
"step": 1014500
},
{
"epoch": 9.365110120777628,
"grad_norm": 1.1027462482452393,
"learning_rate": 3.1744955296592576e-06,
"loss": 0.0131,
"num_input_tokens_seen": 1039353088,
"step": 1015000
},
{
"epoch": 9.369723475516926,
"grad_norm": 0.5520905256271362,
"learning_rate": 3.151428755962761e-06,
"loss": 0.0139,
"num_input_tokens_seen": 1039865088,
"step": 1015500
},
{
"epoch": 9.374336830256226,
"grad_norm": 0.46760430932044983,
"learning_rate": 3.1283619822662647e-06,
"loss": 0.0142,
"num_input_tokens_seen": 1040377088,
"step": 1016000
},
{
"epoch": 9.378950184995524,
"grad_norm": 0.5815434455871582,
"learning_rate": 3.105295208569768e-06,
"loss": 0.0142,
"num_input_tokens_seen": 1040889088,
"step": 1016500
},
{
"epoch": 9.383563539734824,
"grad_norm": 1.3620293140411377,
"learning_rate": 3.0822284348732714e-06,
"loss": 0.0139,
"num_input_tokens_seen": 1041401088,
"step": 1017000
},
{
"epoch": 9.388176894474123,
"grad_norm": 0.8543253540992737,
"learning_rate": 3.059161661176775e-06,
"loss": 0.0128,
"num_input_tokens_seen": 1041913088,
"step": 1017500
},
{
"epoch": 9.392790249213423,
"grad_norm": 1.2159240245819092,
"learning_rate": 3.036094887480278e-06,
"loss": 0.0138,
"num_input_tokens_seen": 1042425088,
"step": 1018000
},
{
"epoch": 9.397403603952723,
"grad_norm": 0.7059375643730164,
"learning_rate": 3.013028113783781e-06,
"loss": 0.0137,
"num_input_tokens_seen": 1042937088,
"step": 1018500
},
{
"epoch": 9.402016958692021,
"grad_norm": 0.45824775099754333,
"learning_rate": 2.9899613400872847e-06,
"loss": 0.0135,
"num_input_tokens_seen": 1043449088,
"step": 1019000
},
{
"epoch": 9.406630313431322,
"grad_norm": 0.6606787443161011,
"learning_rate": 2.9668945663907883e-06,
"loss": 0.0138,
"num_input_tokens_seen": 1043961088,
"step": 1019500
},
{
"epoch": 9.41124366817062,
"grad_norm": 0.8153837323188782,
"learning_rate": 2.9438277926942914e-06,
"loss": 0.0125,
"num_input_tokens_seen": 1044473088,
"step": 1020000
},
{
"epoch": 9.41585702290992,
"grad_norm": 0.4770793318748474,
"learning_rate": 2.920761018997795e-06,
"loss": 0.0135,
"num_input_tokens_seen": 1044985088,
"step": 1020500
},
{
"epoch": 9.420470377649218,
"grad_norm": 1.226976990699768,
"learning_rate": 2.8976942453012985e-06,
"loss": 0.0132,
"num_input_tokens_seen": 1045497088,
"step": 1021000
},
{
"epoch": 9.425083732388519,
"grad_norm": 0.3825905919075012,
"learning_rate": 2.8746274716048016e-06,
"loss": 0.0125,
"num_input_tokens_seen": 1046009088,
"step": 1021500
},
{
"epoch": 9.429697087127817,
"grad_norm": 0.6580853462219238,
"learning_rate": 2.851560697908305e-06,
"loss": 0.014,
"num_input_tokens_seen": 1046521088,
"step": 1022000
},
{
"epoch": 9.434310441867117,
"grad_norm": 1.0704902410507202,
"learning_rate": 2.8284939242118087e-06,
"loss": 0.013,
"num_input_tokens_seen": 1047033088,
"step": 1022500
},
{
"epoch": 9.438923796606415,
"grad_norm": 1.5487003326416016,
"learning_rate": 2.805427150515312e-06,
"loss": 0.0156,
"num_input_tokens_seen": 1047545088,
"step": 1023000
},
{
"epoch": 9.443537151345716,
"grad_norm": 0.4171670079231262,
"learning_rate": 2.7823603768188154e-06,
"loss": 0.0127,
"num_input_tokens_seen": 1048057088,
"step": 1023500
},
{
"epoch": 9.448150506085016,
"grad_norm": 1.1898133754730225,
"learning_rate": 2.7592936031223185e-06,
"loss": 0.0157,
"num_input_tokens_seen": 1048569088,
"step": 1024000
},
{
"epoch": 9.452763860824314,
"grad_norm": 0.4748603105545044,
"learning_rate": 2.7362268294258216e-06,
"loss": 0.0132,
"num_input_tokens_seen": 1049081088,
"step": 1024500
},
{
"epoch": 9.457377215563614,
"grad_norm": 1.6988264322280884,
"learning_rate": 2.713160055729325e-06,
"loss": 0.0139,
"num_input_tokens_seen": 1049593088,
"step": 1025000
},
{
"epoch": 9.461990570302913,
"grad_norm": 1.1586196422576904,
"learning_rate": 2.6900932820328287e-06,
"loss": 0.0144,
"num_input_tokens_seen": 1050105088,
"step": 1025500
},
{
"epoch": 9.466603925042213,
"grad_norm": 1.3323612213134766,
"learning_rate": 2.6670265083363323e-06,
"loss": 0.0117,
"num_input_tokens_seen": 1050617088,
"step": 1026000
},
{
"epoch": 9.471217279781511,
"grad_norm": 0.6006079316139221,
"learning_rate": 2.6439597346398354e-06,
"loss": 0.0147,
"num_input_tokens_seen": 1051129088,
"step": 1026500
},
{
"epoch": 9.475830634520811,
"grad_norm": 0.9578723907470703,
"learning_rate": 2.620892960943339e-06,
"loss": 0.0122,
"num_input_tokens_seen": 1051641088,
"step": 1027000
},
{
"epoch": 9.48044398926011,
"grad_norm": 0.9589295983314514,
"learning_rate": 2.5978261872468425e-06,
"loss": 0.0136,
"num_input_tokens_seen": 1052153088,
"step": 1027500
},
{
"epoch": 9.48505734399941,
"grad_norm": 1.320854663848877,
"learning_rate": 2.5747594135503456e-06,
"loss": 0.0166,
"num_input_tokens_seen": 1052665088,
"step": 1028000
},
{
"epoch": 9.489670698738708,
"grad_norm": 0.5850228071212769,
"learning_rate": 2.551692639853849e-06,
"loss": 0.0127,
"num_input_tokens_seen": 1053177088,
"step": 1028500
},
{
"epoch": 9.494284053478008,
"grad_norm": 0.4947618544101715,
"learning_rate": 2.5286258661573527e-06,
"loss": 0.013,
"num_input_tokens_seen": 1053689088,
"step": 1029000
},
{
"epoch": 9.498897408217307,
"grad_norm": 1.5554652214050293,
"learning_rate": 2.505559092460856e-06,
"loss": 0.0114,
"num_input_tokens_seen": 1054201088,
"step": 1029500
},
{
"epoch": 9.503510762956607,
"grad_norm": 0.7134987711906433,
"learning_rate": 2.482492318764359e-06,
"loss": 0.0131,
"num_input_tokens_seen": 1054713088,
"step": 1030000
},
{
"epoch": 9.508124117695907,
"grad_norm": 0.6300977468490601,
"learning_rate": 2.4594255450678625e-06,
"loss": 0.0132,
"num_input_tokens_seen": 1055225088,
"step": 1030500
},
{
"epoch": 9.512737472435205,
"grad_norm": 0.30723100900650024,
"learning_rate": 2.4363587713713656e-06,
"loss": 0.012,
"num_input_tokens_seen": 1055737088,
"step": 1031000
},
{
"epoch": 9.517350827174505,
"grad_norm": 0.5518991947174072,
"learning_rate": 2.413291997674869e-06,
"loss": 0.0125,
"num_input_tokens_seen": 1056249088,
"step": 1031500
},
{
"epoch": 9.521964181913804,
"grad_norm": 0.48715853691101074,
"learning_rate": 2.3902252239783727e-06,
"loss": 0.0147,
"num_input_tokens_seen": 1056761088,
"step": 1032000
},
{
"epoch": 9.526577536653104,
"grad_norm": 0.9060729742050171,
"learning_rate": 2.3671584502818763e-06,
"loss": 0.0138,
"num_input_tokens_seen": 1057273088,
"step": 1032500
},
{
"epoch": 9.531190891392402,
"grad_norm": 0.6399810910224915,
"learning_rate": 2.3440916765853794e-06,
"loss": 0.0154,
"num_input_tokens_seen": 1057785088,
"step": 1033000
},
{
"epoch": 9.535804246131702,
"grad_norm": 0.8663894534111023,
"learning_rate": 2.321024902888883e-06,
"loss": 0.0128,
"num_input_tokens_seen": 1058297088,
"step": 1033500
},
{
"epoch": 9.540417600871,
"grad_norm": 1.554218053817749,
"learning_rate": 2.2979581291923865e-06,
"loss": 0.013,
"num_input_tokens_seen": 1058809088,
"step": 1034000
},
{
"epoch": 9.5450309556103,
"grad_norm": 0.5967795848846436,
"learning_rate": 2.2748913554958896e-06,
"loss": 0.0136,
"num_input_tokens_seen": 1059321088,
"step": 1034500
},
{
"epoch": 9.5496443103496,
"grad_norm": 0.7761898040771484,
"learning_rate": 2.251824581799393e-06,
"loss": 0.0137,
"num_input_tokens_seen": 1059833088,
"step": 1035000
},
{
"epoch": 9.5542576650889,
"grad_norm": 0.4565838873386383,
"learning_rate": 2.2287578081028963e-06,
"loss": 0.0137,
"num_input_tokens_seen": 1060345088,
"step": 1035500
},
{
"epoch": 9.5588710198282,
"grad_norm": 1.4918292760849,
"learning_rate": 2.2056910344063994e-06,
"loss": 0.0154,
"num_input_tokens_seen": 1060857088,
"step": 1036000
},
{
"epoch": 9.563484374567498,
"grad_norm": 1.143227458000183,
"learning_rate": 2.182624260709903e-06,
"loss": 0.0126,
"num_input_tokens_seen": 1061369088,
"step": 1036500
},
{
"epoch": 9.568097729306798,
"grad_norm": 0.4711507558822632,
"learning_rate": 2.1595574870134065e-06,
"loss": 0.0139,
"num_input_tokens_seen": 1061881088,
"step": 1037000
},
{
"epoch": 9.572711084046096,
"grad_norm": 1.8225018978118896,
"learning_rate": 2.1364907133169096e-06,
"loss": 0.0148,
"num_input_tokens_seen": 1062393088,
"step": 1037500
},
{
"epoch": 9.577324438785396,
"grad_norm": 1.6516982316970825,
"learning_rate": 2.113423939620413e-06,
"loss": 0.0139,
"num_input_tokens_seen": 1062905088,
"step": 1038000
},
{
"epoch": 9.581937793524695,
"grad_norm": 0.6592885255813599,
"learning_rate": 2.0903571659239167e-06,
"loss": 0.0135,
"num_input_tokens_seen": 1063417088,
"step": 1038500
},
{
"epoch": 9.586551148263995,
"grad_norm": 0.9162536263465881,
"learning_rate": 2.0672903922274203e-06,
"loss": 0.0143,
"num_input_tokens_seen": 1063929088,
"step": 1039000
},
{
"epoch": 9.591164503003293,
"grad_norm": 1.3136478662490845,
"learning_rate": 2.0442236185309234e-06,
"loss": 0.0145,
"num_input_tokens_seen": 1064441088,
"step": 1039500
},
{
"epoch": 9.595777857742593,
"grad_norm": 0.8929975032806396,
"learning_rate": 2.021156844834427e-06,
"loss": 0.0124,
"num_input_tokens_seen": 1064953088,
"step": 1040000
},
{
"epoch": 9.600391212481892,
"grad_norm": 0.6862032413482666,
"learning_rate": 1.9980900711379305e-06,
"loss": 0.0116,
"num_input_tokens_seen": 1065465088,
"step": 1040500
},
{
"epoch": 9.605004567221192,
"grad_norm": 1.4420340061187744,
"learning_rate": 1.9750232974414336e-06,
"loss": 0.0131,
"num_input_tokens_seen": 1065977088,
"step": 1041000
},
{
"epoch": 9.609617921960492,
"grad_norm": 0.6107918620109558,
"learning_rate": 1.9519565237449367e-06,
"loss": 0.0128,
"num_input_tokens_seen": 1066489088,
"step": 1041500
},
{
"epoch": 9.61423127669979,
"grad_norm": 0.8065725564956665,
"learning_rate": 1.9288897500484403e-06,
"loss": 0.0136,
"num_input_tokens_seen": 1067001088,
"step": 1042000
},
{
"epoch": 9.61884463143909,
"grad_norm": 1.1736738681793213,
"learning_rate": 1.9058229763519436e-06,
"loss": 0.0142,
"num_input_tokens_seen": 1067513088,
"step": 1042500
},
{
"epoch": 9.623457986178389,
"grad_norm": 3.729763984680176,
"learning_rate": 1.882756202655447e-06,
"loss": 0.0131,
"num_input_tokens_seen": 1068025088,
"step": 1043000
},
{
"epoch": 9.628071340917689,
"grad_norm": 0.39236801862716675,
"learning_rate": 1.8596894289589505e-06,
"loss": 0.014,
"num_input_tokens_seen": 1068537088,
"step": 1043500
},
{
"epoch": 9.632684695656987,
"grad_norm": 1.0780402421951294,
"learning_rate": 1.8366226552624536e-06,
"loss": 0.0112,
"num_input_tokens_seen": 1069049088,
"step": 1044000
},
{
"epoch": 9.637298050396288,
"grad_norm": 0.5110656023025513,
"learning_rate": 1.8135558815659572e-06,
"loss": 0.0127,
"num_input_tokens_seen": 1069561088,
"step": 1044500
},
{
"epoch": 9.641911405135586,
"grad_norm": 0.23593804240226746,
"learning_rate": 1.7904891078694607e-06,
"loss": 0.0131,
"num_input_tokens_seen": 1070073088,
"step": 1045000
},
{
"epoch": 9.646524759874886,
"grad_norm": 0.9505711793899536,
"learning_rate": 1.767422334172964e-06,
"loss": 0.0125,
"num_input_tokens_seen": 1070585088,
"step": 1045500
},
{
"epoch": 9.651138114614184,
"grad_norm": 0.9649909138679504,
"learning_rate": 1.7443555604764672e-06,
"loss": 0.0153,
"num_input_tokens_seen": 1071097088,
"step": 1046000
},
{
"epoch": 9.655751469353484,
"grad_norm": 0.29947414994239807,
"learning_rate": 1.7212887867799707e-06,
"loss": 0.014,
"num_input_tokens_seen": 1071609088,
"step": 1046500
},
{
"epoch": 9.660364824092785,
"grad_norm": 0.9218162298202515,
"learning_rate": 1.6982220130834743e-06,
"loss": 0.0141,
"num_input_tokens_seen": 1072121088,
"step": 1047000
},
{
"epoch": 9.664978178832083,
"grad_norm": 1.3005330562591553,
"learning_rate": 1.6751552393869774e-06,
"loss": 0.0145,
"num_input_tokens_seen": 1072633088,
"step": 1047500
},
{
"epoch": 9.669591533571383,
"grad_norm": 1.300002932548523,
"learning_rate": 1.652088465690481e-06,
"loss": 0.0137,
"num_input_tokens_seen": 1073145088,
"step": 1048000
},
{
"epoch": 9.674204888310681,
"grad_norm": 0.6326736211776733,
"learning_rate": 1.6290216919939843e-06,
"loss": 0.0163,
"num_input_tokens_seen": 1073657088,
"step": 1048500
},
{
"epoch": 9.678818243049982,
"grad_norm": 0.865162193775177,
"learning_rate": 1.6059549182974874e-06,
"loss": 0.0137,
"num_input_tokens_seen": 1074169088,
"step": 1049000
},
{
"epoch": 9.68343159778928,
"grad_norm": 0.6226495504379272,
"learning_rate": 1.582888144600991e-06,
"loss": 0.013,
"num_input_tokens_seen": 1074681088,
"step": 1049500
},
{
"epoch": 9.68804495252858,
"grad_norm": 1.6454648971557617,
"learning_rate": 1.5598213709044945e-06,
"loss": 0.012,
"num_input_tokens_seen": 1075193088,
"step": 1050000
},
{
"epoch": 9.692658307267878,
"grad_norm": 0.4671117663383484,
"learning_rate": 1.5367545972079978e-06,
"loss": 0.0133,
"num_input_tokens_seen": 1075705088,
"step": 1050500
},
{
"epoch": 9.697271662007179,
"grad_norm": 0.9937256574630737,
"learning_rate": 1.5136878235115012e-06,
"loss": 0.0142,
"num_input_tokens_seen": 1076217088,
"step": 1051000
},
{
"epoch": 9.701885016746477,
"grad_norm": 0.976679265499115,
"learning_rate": 1.4906210498150045e-06,
"loss": 0.0134,
"num_input_tokens_seen": 1076729088,
"step": 1051500
},
{
"epoch": 9.706498371485777,
"grad_norm": 0.5003361701965332,
"learning_rate": 1.4675542761185078e-06,
"loss": 0.0139,
"num_input_tokens_seen": 1077241088,
"step": 1052000
},
{
"epoch": 9.711111726225077,
"grad_norm": 0.7003839015960693,
"learning_rate": 1.4444875024220114e-06,
"loss": 0.013,
"num_input_tokens_seen": 1077753088,
"step": 1052500
},
{
"epoch": 9.715725080964376,
"grad_norm": 0.6862497925758362,
"learning_rate": 1.4214207287255147e-06,
"loss": 0.0132,
"num_input_tokens_seen": 1078265088,
"step": 1053000
},
{
"epoch": 9.720338435703676,
"grad_norm": 0.26981067657470703,
"learning_rate": 1.398353955029018e-06,
"loss": 0.0124,
"num_input_tokens_seen": 1078777088,
"step": 1053500
},
{
"epoch": 9.724951790442974,
"grad_norm": 0.6135255694389343,
"learning_rate": 1.3752871813325216e-06,
"loss": 0.0133,
"num_input_tokens_seen": 1079289088,
"step": 1054000
},
{
"epoch": 9.729565145182274,
"grad_norm": 0.6279376149177551,
"learning_rate": 1.3522204076360247e-06,
"loss": 0.014,
"num_input_tokens_seen": 1079801088,
"step": 1054500
},
{
"epoch": 9.734178499921573,
"grad_norm": 1.5329886674880981,
"learning_rate": 1.329153633939528e-06,
"loss": 0.0152,
"num_input_tokens_seen": 1080313088,
"step": 1055000
},
{
"epoch": 9.738791854660873,
"grad_norm": 1.2570598125457764,
"learning_rate": 1.3060868602430316e-06,
"loss": 0.0126,
"num_input_tokens_seen": 1080825088,
"step": 1055500
},
{
"epoch": 9.743405209400171,
"grad_norm": 1.8935927152633667,
"learning_rate": 1.283020086546535e-06,
"loss": 0.0138,
"num_input_tokens_seen": 1081337088,
"step": 1056000
},
{
"epoch": 9.748018564139471,
"grad_norm": 0.5364086031913757,
"learning_rate": 1.2599533128500385e-06,
"loss": 0.0136,
"num_input_tokens_seen": 1081849088,
"step": 1056500
},
{
"epoch": 9.75263191887877,
"grad_norm": 0.6562399864196777,
"learning_rate": 1.2368865391535418e-06,
"loss": 0.0128,
"num_input_tokens_seen": 1082361088,
"step": 1057000
},
{
"epoch": 9.75724527361807,
"grad_norm": 0.7584030628204346,
"learning_rate": 1.213819765457045e-06,
"loss": 0.013,
"num_input_tokens_seen": 1082873088,
"step": 1057500
},
{
"epoch": 9.76185862835737,
"grad_norm": 0.8746394515037537,
"learning_rate": 1.1907529917605485e-06,
"loss": 0.014,
"num_input_tokens_seen": 1083385088,
"step": 1058000
},
{
"epoch": 9.766471983096668,
"grad_norm": 1.1132066249847412,
"learning_rate": 1.1676862180640518e-06,
"loss": 0.0129,
"num_input_tokens_seen": 1083897088,
"step": 1058500
},
{
"epoch": 9.771085337835968,
"grad_norm": 0.7786855697631836,
"learning_rate": 1.1446194443675554e-06,
"loss": 0.0137,
"num_input_tokens_seen": 1084409088,
"step": 1059000
},
{
"epoch": 9.775698692575267,
"grad_norm": 0.5935215353965759,
"learning_rate": 1.1215526706710587e-06,
"loss": 0.0144,
"num_input_tokens_seen": 1084921088,
"step": 1059500
},
{
"epoch": 9.780312047314567,
"grad_norm": 1.0187913179397583,
"learning_rate": 1.098485896974562e-06,
"loss": 0.0145,
"num_input_tokens_seen": 1085433088,
"step": 1060000
},
{
"epoch": 9.784925402053865,
"grad_norm": 0.6144331693649292,
"learning_rate": 1.0754191232780654e-06,
"loss": 0.0123,
"num_input_tokens_seen": 1085945088,
"step": 1060500
},
{
"epoch": 9.789538756793165,
"grad_norm": 0.6357366442680359,
"learning_rate": 1.0523523495815687e-06,
"loss": 0.0124,
"num_input_tokens_seen": 1086457088,
"step": 1061000
},
{
"epoch": 9.794152111532464,
"grad_norm": 8.163220405578613,
"learning_rate": 1.0292855758850723e-06,
"loss": 0.0151,
"num_input_tokens_seen": 1086969088,
"step": 1061500
},
{
"epoch": 9.798765466271764,
"grad_norm": 1.1560457944869995,
"learning_rate": 1.0062188021885756e-06,
"loss": 0.0142,
"num_input_tokens_seen": 1087481088,
"step": 1062000
},
{
"epoch": 9.803378821011062,
"grad_norm": 1.6285614967346191,
"learning_rate": 9.83152028492079e-07,
"loss": 0.0124,
"num_input_tokens_seen": 1087993088,
"step": 1062500
},
{
"epoch": 9.807992175750362,
"grad_norm": 0.9213132858276367,
"learning_rate": 9.600852547955823e-07,
"loss": 0.0125,
"num_input_tokens_seen": 1088505088,
"step": 1063000
},
{
"epoch": 9.81260553048966,
"grad_norm": 0.7964446544647217,
"learning_rate": 9.370184810990857e-07,
"loss": 0.0139,
"num_input_tokens_seen": 1089017088,
"step": 1063500
},
{
"epoch": 9.81721888522896,
"grad_norm": 0.8223236799240112,
"learning_rate": 9.139517074025889e-07,
"loss": 0.0128,
"num_input_tokens_seen": 1089529088,
"step": 1064000
},
{
"epoch": 9.821832239968261,
"grad_norm": 0.9797717332839966,
"learning_rate": 8.908849337060925e-07,
"loss": 0.0126,
"num_input_tokens_seen": 1090041088,
"step": 1064500
},
{
"epoch": 9.82644559470756,
"grad_norm": 0.23104320466518402,
"learning_rate": 8.678181600095958e-07,
"loss": 0.0145,
"num_input_tokens_seen": 1090553088,
"step": 1065000
},
{
"epoch": 9.83105894944686,
"grad_norm": 0.5735734105110168,
"learning_rate": 8.447513863130993e-07,
"loss": 0.0146,
"num_input_tokens_seen": 1091065088,
"step": 1065500
},
{
"epoch": 9.835672304186158,
"grad_norm": 0.5744655132293701,
"learning_rate": 8.216846126166026e-07,
"loss": 0.0128,
"num_input_tokens_seen": 1091577088,
"step": 1066000
},
{
"epoch": 9.840285658925458,
"grad_norm": 4.304238319396973,
"learning_rate": 7.986178389201059e-07,
"loss": 0.0124,
"num_input_tokens_seen": 1092089088,
"step": 1066500
},
{
"epoch": 9.844899013664756,
"grad_norm": 0.7492998838424683,
"learning_rate": 7.755510652236094e-07,
"loss": 0.0137,
"num_input_tokens_seen": 1092601088,
"step": 1067000
},
{
"epoch": 9.849512368404056,
"grad_norm": 0.21370269358158112,
"learning_rate": 7.524842915271127e-07,
"loss": 0.0128,
"num_input_tokens_seen": 1093113088,
"step": 1067500
},
{
"epoch": 9.854125723143355,
"grad_norm": 1.3890074491500854,
"learning_rate": 7.294175178306161e-07,
"loss": 0.0133,
"num_input_tokens_seen": 1093625088,
"step": 1068000
},
{
"epoch": 9.858739077882655,
"grad_norm": 0.9255247116088867,
"learning_rate": 7.063507441341195e-07,
"loss": 0.0125,
"num_input_tokens_seen": 1094137088,
"step": 1068500
},
{
"epoch": 9.863352432621953,
"grad_norm": 0.617211639881134,
"learning_rate": 6.832839704376229e-07,
"loss": 0.0118,
"num_input_tokens_seen": 1094649088,
"step": 1069000
},
{
"epoch": 9.867965787361253,
"grad_norm": 0.7818981409072876,
"learning_rate": 6.602171967411263e-07,
"loss": 0.0128,
"num_input_tokens_seen": 1095161088,
"step": 1069500
},
{
"epoch": 9.872579142100552,
"grad_norm": 0.7910097241401672,
"learning_rate": 6.371504230446296e-07,
"loss": 0.0125,
"num_input_tokens_seen": 1095673088,
"step": 1070000
},
{
"epoch": 9.877192496839852,
"grad_norm": 0.9167271256446838,
"learning_rate": 6.14083649348133e-07,
"loss": 0.0145,
"num_input_tokens_seen": 1096185088,
"step": 1070500
},
{
"epoch": 9.881805851579152,
"grad_norm": 0.4515294134616852,
"learning_rate": 5.910168756516364e-07,
"loss": 0.0128,
"num_input_tokens_seen": 1096697088,
"step": 1071000
},
{
"epoch": 9.88641920631845,
"grad_norm": 1.4242569208145142,
"learning_rate": 5.679501019551397e-07,
"loss": 0.0123,
"num_input_tokens_seen": 1097209088,
"step": 1071500
},
{
"epoch": 9.89103256105775,
"grad_norm": 1.5031037330627441,
"learning_rate": 5.448833282586431e-07,
"loss": 0.0132,
"num_input_tokens_seen": 1097721088,
"step": 1072000
},
{
"epoch": 9.895645915797049,
"grad_norm": 0.5102546215057373,
"learning_rate": 5.218165545621465e-07,
"loss": 0.0134,
"num_input_tokens_seen": 1098233088,
"step": 1072500
},
{
"epoch": 9.900259270536349,
"grad_norm": 0.5648242831230164,
"learning_rate": 4.987497808656499e-07,
"loss": 0.0132,
"num_input_tokens_seen": 1098745088,
"step": 1073000
},
{
"epoch": 9.904872625275647,
"grad_norm": 1.368865728378296,
"learning_rate": 4.756830071691533e-07,
"loss": 0.0136,
"num_input_tokens_seen": 1099257088,
"step": 1073500
},
{
"epoch": 9.909485980014948,
"grad_norm": 0.372745156288147,
"learning_rate": 4.5261623347265665e-07,
"loss": 0.0149,
"num_input_tokens_seen": 1099769088,
"step": 1074000
},
{
"epoch": 9.914099334754246,
"grad_norm": 0.5571704506874084,
"learning_rate": 4.2954945977616003e-07,
"loss": 0.0132,
"num_input_tokens_seen": 1100281088,
"step": 1074500
},
{
"epoch": 9.918712689493546,
"grad_norm": 0.44755375385284424,
"learning_rate": 4.064826860796634e-07,
"loss": 0.0138,
"num_input_tokens_seen": 1100793088,
"step": 1075000
},
{
"epoch": 9.923326044232844,
"grad_norm": 0.467204749584198,
"learning_rate": 3.834159123831668e-07,
"loss": 0.0138,
"num_input_tokens_seen": 1101305088,
"step": 1075500
},
{
"epoch": 9.927939398972145,
"grad_norm": 1.1227315664291382,
"learning_rate": 3.603491386866702e-07,
"loss": 0.0131,
"num_input_tokens_seen": 1101817088,
"step": 1076000
},
{
"epoch": 9.932552753711445,
"grad_norm": 0.8583968877792358,
"learning_rate": 3.3728236499017353e-07,
"loss": 0.0143,
"num_input_tokens_seen": 1102329088,
"step": 1076500
},
{
"epoch": 9.937166108450743,
"grad_norm": 0.830702543258667,
"learning_rate": 3.14215591293677e-07,
"loss": 0.0139,
"num_input_tokens_seen": 1102841088,
"step": 1077000
},
{
"epoch": 9.941779463190043,
"grad_norm": 1.864600658416748,
"learning_rate": 2.9114881759718036e-07,
"loss": 0.013,
"num_input_tokens_seen": 1103353088,
"step": 1077500
},
{
"epoch": 9.946392817929341,
"grad_norm": 0.8975169658660889,
"learning_rate": 2.680820439006837e-07,
"loss": 0.0127,
"num_input_tokens_seen": 1103865088,
"step": 1078000
},
{
"epoch": 9.951006172668642,
"grad_norm": 0.7767340540885925,
"learning_rate": 2.450152702041871e-07,
"loss": 0.0132,
"num_input_tokens_seen": 1104377088,
"step": 1078500
},
{
"epoch": 9.95561952740794,
"grad_norm": 0.6193325519561768,
"learning_rate": 2.2194849650769047e-07,
"loss": 0.0144,
"num_input_tokens_seen": 1104889088,
"step": 1079000
},
{
"epoch": 9.96023288214724,
"grad_norm": 1.1023420095443726,
"learning_rate": 1.9888172281119386e-07,
"loss": 0.0141,
"num_input_tokens_seen": 1105401088,
"step": 1079500
},
{
"epoch": 9.964846236886538,
"grad_norm": 1.2743160724639893,
"learning_rate": 1.7581494911469725e-07,
"loss": 0.0119,
"num_input_tokens_seen": 1105913088,
"step": 1080000
},
{
"epoch": 9.969459591625839,
"grad_norm": 0.7009992599487305,
"learning_rate": 1.527481754182006e-07,
"loss": 0.0131,
"num_input_tokens_seen": 1106425088,
"step": 1080500
},
{
"epoch": 9.974072946365137,
"grad_norm": 0.5736069679260254,
"learning_rate": 1.29681401721704e-07,
"loss": 0.0174,
"num_input_tokens_seen": 1106937088,
"step": 1081000
},
{
"epoch": 9.978686301104437,
"grad_norm": 0.4789179861545563,
"learning_rate": 1.0661462802520738e-07,
"loss": 0.0129,
"num_input_tokens_seen": 1107449088,
"step": 1081500
},
{
"epoch": 9.983299655843737,
"grad_norm": 0.7064932584762573,
"learning_rate": 8.354785432871076e-08,
"loss": 0.0122,
"num_input_tokens_seen": 1107961088,
"step": 1082000
},
{
"epoch": 9.987913010583036,
"grad_norm": 1.0066189765930176,
"learning_rate": 6.048108063221414e-08,
"loss": 0.0127,
"num_input_tokens_seen": 1108473088,
"step": 1082500
},
{
"epoch": 9.992526365322336,
"grad_norm": 1.61360502243042,
"learning_rate": 3.7414306935717514e-08,
"loss": 0.0135,
"num_input_tokens_seen": 1108985088,
"step": 1083000
},
{
"epoch": 9.997139720061634,
"grad_norm": 0.37303218245506287,
"learning_rate": 1.4347533239220898e-08,
"loss": 0.0133,
"num_input_tokens_seen": 1109497088,
"step": 1083500
},
{
"epoch": 10.0,
"eval_combined_score": 0.06429717740844736,
"eval_loss": 0.06429717689752579,
"eval_mse": 0.06429717791936893,
"eval_runtime": 46.2743,
"eval_samples_per_second": 2081.892,
"eval_steps_per_second": 260.253,
"num_input_tokens_seen": 1109813760,
"step": 1083810
},
{
"epoch": 10.0,
"num_input_tokens_seen": 1109813760,
"step": 1083810,
"total_flos": 1.4278349548463616e+17,
"train_loss": 0.035630166295778455,
"train_runtime": 37672.0963,
"train_samples_per_second": 230.155,
"train_steps_per_second": 28.77,
"train_tokens_per_second": 29459.836
}
],
"logging_steps": 500,
"max_steps": 1083810,
"num_input_tokens_seen": 1109813760,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4278349548463616e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}