| { |
| "best_global_step": 758667, |
| "best_metric": 0.06412914395332336, |
| "best_model_checkpoint": "/media/user/Expansion1/multilingual-e5-small-aligned-v2-text-quality-v3/checkpoint-758667", |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 1083810, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.004613354739299324, |
| "grad_norm": 0.9464718699455261, |
| "learning_rate": 4.99769793598509e-05, |
| "loss": 0.2108, |
| "num_input_tokens_seen": 512000, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.009226709478598648, |
| "grad_norm": 1.6402217149734497, |
| "learning_rate": 4.99539125861544e-05, |
| "loss": 0.1148, |
| "num_input_tokens_seen": 1024000, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.013840064217897971, |
| "grad_norm": 2.3964197635650635, |
| "learning_rate": 4.9930845812457905e-05, |
| "loss": 0.115, |
| "num_input_tokens_seen": 1536000, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.018453418957197296, |
| "grad_norm": 1.5508780479431152, |
| "learning_rate": 4.990777903876141e-05, |
| "loss": 0.0986, |
| "num_input_tokens_seen": 2048000, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.02306677369649662, |
| "grad_norm": 1.0917489528656006, |
| "learning_rate": 4.9884712265064913e-05, |
| "loss": 0.1006, |
| "num_input_tokens_seen": 2560000, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.027680128435795943, |
| "grad_norm": 3.2608118057250977, |
| "learning_rate": 4.9861645491368414e-05, |
| "loss": 0.0966, |
| "num_input_tokens_seen": 3072000, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.03229348317509526, |
| "grad_norm": 0.6695080995559692, |
| "learning_rate": 4.983857871767192e-05, |
| "loss": 0.0966, |
| "num_input_tokens_seen": 3584000, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.03690683791439459, |
| "grad_norm": 0.9232053756713867, |
| "learning_rate": 4.981551194397542e-05, |
| "loss": 0.0937, |
| "num_input_tokens_seen": 4096000, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.041520192653693916, |
| "grad_norm": 1.5442851781845093, |
| "learning_rate": 4.979244517027893e-05, |
| "loss": 0.0966, |
| "num_input_tokens_seen": 4608000, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.04613354739299324, |
| "grad_norm": 1.1777746677398682, |
| "learning_rate": 4.976937839658243e-05, |
| "loss": 0.0928, |
| "num_input_tokens_seen": 5120000, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.05074690213229256, |
| "grad_norm": 1.1882743835449219, |
| "learning_rate": 4.974631162288593e-05, |
| "loss": 0.0982, |
| "num_input_tokens_seen": 5632000, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.055360256871591886, |
| "grad_norm": 1.9017492532730103, |
| "learning_rate": 4.972324484918944e-05, |
| "loss": 0.0968, |
| "num_input_tokens_seen": 6144000, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.05997361161089121, |
| "grad_norm": 0.9373461008071899, |
| "learning_rate": 4.970017807549294e-05, |
| "loss": 0.0942, |
| "num_input_tokens_seen": 6656000, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.06458696635019053, |
| "grad_norm": 1.4917376041412354, |
| "learning_rate": 4.967711130179644e-05, |
| "loss": 0.0935, |
| "num_input_tokens_seen": 7168000, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.06920032108948986, |
| "grad_norm": 0.534630298614502, |
| "learning_rate": 4.9654044528099946e-05, |
| "loss": 0.0879, |
| "num_input_tokens_seen": 7680000, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.07381367582878919, |
| "grad_norm": 1.9700461626052856, |
| "learning_rate": 4.9630977754403454e-05, |
| "loss": 0.0913, |
| "num_input_tokens_seen": 8192000, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.0784270305680885, |
| "grad_norm": 2.11916446685791, |
| "learning_rate": 4.960791098070695e-05, |
| "loss": 0.0929, |
| "num_input_tokens_seen": 8704000, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.08304038530738783, |
| "grad_norm": 1.961242437362671, |
| "learning_rate": 4.9584844207010455e-05, |
| "loss": 0.0883, |
| "num_input_tokens_seen": 9216000, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.08765374004668715, |
| "grad_norm": 1.5819107294082642, |
| "learning_rate": 4.956177743331396e-05, |
| "loss": 0.0849, |
| "num_input_tokens_seen": 9728000, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.09226709478598648, |
| "grad_norm": 0.8099465370178223, |
| "learning_rate": 4.953871065961746e-05, |
| "loss": 0.0925, |
| "num_input_tokens_seen": 10240000, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.0968804495252858, |
| "grad_norm": 0.8762685656547546, |
| "learning_rate": 4.9515643885920963e-05, |
| "loss": 0.0867, |
| "num_input_tokens_seen": 10752000, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.10149380426458512, |
| "grad_norm": 2.166046142578125, |
| "learning_rate": 4.949257711222447e-05, |
| "loss": 0.0906, |
| "num_input_tokens_seen": 11264000, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.10610715900388444, |
| "grad_norm": 0.9908414483070374, |
| "learning_rate": 4.946951033852797e-05, |
| "loss": 0.0989, |
| "num_input_tokens_seen": 11776000, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.11072051374318377, |
| "grad_norm": 0.9543777704238892, |
| "learning_rate": 4.944644356483147e-05, |
| "loss": 0.0917, |
| "num_input_tokens_seen": 12288000, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.11533386848248309, |
| "grad_norm": 2.302893877029419, |
| "learning_rate": 4.942337679113498e-05, |
| "loss": 0.0906, |
| "num_input_tokens_seen": 12800000, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.11994722322178242, |
| "grad_norm": 1.214758038520813, |
| "learning_rate": 4.940031001743849e-05, |
| "loss": 0.0831, |
| "num_input_tokens_seen": 13312000, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.12456057796108173, |
| "grad_norm": 1.4494785070419312, |
| "learning_rate": 4.937724324374199e-05, |
| "loss": 0.0949, |
| "num_input_tokens_seen": 13824000, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.12917393270038105, |
| "grad_norm": 1.3759499788284302, |
| "learning_rate": 4.935417647004549e-05, |
| "loss": 0.0775, |
| "num_input_tokens_seen": 14336000, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.13378728743968038, |
| "grad_norm": 1.4409326314926147, |
| "learning_rate": 4.9331109696348995e-05, |
| "loss": 0.0874, |
| "num_input_tokens_seen": 14848000, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.1384006421789797, |
| "grad_norm": 0.6916935443878174, |
| "learning_rate": 4.9308042922652496e-05, |
| "loss": 0.0888, |
| "num_input_tokens_seen": 15360000, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.14301399691827904, |
| "grad_norm": 2.6819546222686768, |
| "learning_rate": 4.9284976148956e-05, |
| "loss": 0.0866, |
| "num_input_tokens_seen": 15872000, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.14762735165757837, |
| "grad_norm": 2.243403434753418, |
| "learning_rate": 4.9261909375259504e-05, |
| "loss": 0.0858, |
| "num_input_tokens_seen": 16384000, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.15224070639687767, |
| "grad_norm": 0.6077441573143005, |
| "learning_rate": 4.9238842601563004e-05, |
| "loss": 0.0829, |
| "num_input_tokens_seen": 16896000, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.156854061136177, |
| "grad_norm": 0.7938207387924194, |
| "learning_rate": 4.921577582786651e-05, |
| "loss": 0.0807, |
| "num_input_tokens_seen": 17408000, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.16146741587547633, |
| "grad_norm": 1.7776683568954468, |
| "learning_rate": 4.919270905417002e-05, |
| "loss": 0.0801, |
| "num_input_tokens_seen": 17920000, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.16608077061477566, |
| "grad_norm": 0.9043099880218506, |
| "learning_rate": 4.916964228047351e-05, |
| "loss": 0.0806, |
| "num_input_tokens_seen": 18432000, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.17069412535407497, |
| "grad_norm": 3.0099849700927734, |
| "learning_rate": 4.914657550677702e-05, |
| "loss": 0.0801, |
| "num_input_tokens_seen": 18944000, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.1753074800933743, |
| "grad_norm": 1.3632686138153076, |
| "learning_rate": 4.912350873308053e-05, |
| "loss": 0.0858, |
| "num_input_tokens_seen": 19456000, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.17992083483267363, |
| "grad_norm": 1.3890104293823242, |
| "learning_rate": 4.910044195938403e-05, |
| "loss": 0.0807, |
| "num_input_tokens_seen": 19968000, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.18453418957197296, |
| "grad_norm": 1.393978476524353, |
| "learning_rate": 4.907737518568753e-05, |
| "loss": 0.09, |
| "num_input_tokens_seen": 20480000, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.18914754431127226, |
| "grad_norm": 0.9538819193840027, |
| "learning_rate": 4.9054308411991036e-05, |
| "loss": 0.0862, |
| "num_input_tokens_seen": 20992000, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.1937608990505716, |
| "grad_norm": 1.6974983215332031, |
| "learning_rate": 4.903124163829454e-05, |
| "loss": 0.0778, |
| "num_input_tokens_seen": 21504000, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.19837425378987092, |
| "grad_norm": 0.43043065071105957, |
| "learning_rate": 4.900817486459804e-05, |
| "loss": 0.0927, |
| "num_input_tokens_seen": 22016000, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.20298760852917025, |
| "grad_norm": 0.9475088119506836, |
| "learning_rate": 4.8985108090901545e-05, |
| "loss": 0.0813, |
| "num_input_tokens_seen": 22528000, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.20760096326846958, |
| "grad_norm": 3.547081470489502, |
| "learning_rate": 4.8962041317205045e-05, |
| "loss": 0.0849, |
| "num_input_tokens_seen": 23040000, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.21221431800776888, |
| "grad_norm": 1.2342774868011475, |
| "learning_rate": 4.893897454350855e-05, |
| "loss": 0.0831, |
| "num_input_tokens_seen": 23552000, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.2168276727470682, |
| "grad_norm": 2.133857488632202, |
| "learning_rate": 4.891590776981205e-05, |
| "loss": 0.0774, |
| "num_input_tokens_seen": 24064000, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.22144102748636754, |
| "grad_norm": 2.0566883087158203, |
| "learning_rate": 4.889284099611556e-05, |
| "loss": 0.0778, |
| "num_input_tokens_seen": 24576000, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.22605438222566687, |
| "grad_norm": 0.5913178324699402, |
| "learning_rate": 4.886977422241906e-05, |
| "loss": 0.0811, |
| "num_input_tokens_seen": 25088000, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.23066773696496617, |
| "grad_norm": 1.9674791097640991, |
| "learning_rate": 4.884670744872256e-05, |
| "loss": 0.0743, |
| "num_input_tokens_seen": 25600000, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.2352810917042655, |
| "grad_norm": 0.5584122538566589, |
| "learning_rate": 4.882364067502607e-05, |
| "loss": 0.0852, |
| "num_input_tokens_seen": 26112000, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.23989444644356483, |
| "grad_norm": 1.9229296445846558, |
| "learning_rate": 4.880057390132957e-05, |
| "loss": 0.0828, |
| "num_input_tokens_seen": 26624000, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.24450780118286417, |
| "grad_norm": 1.968058466911316, |
| "learning_rate": 4.877750712763308e-05, |
| "loss": 0.0822, |
| "num_input_tokens_seen": 27136000, |
| "step": 26500 |
| }, |
| { |
| "epoch": 0.24912115592216347, |
| "grad_norm": 1.6034080982208252, |
| "learning_rate": 4.875444035393658e-05, |
| "loss": 0.0822, |
| "num_input_tokens_seen": 27648000, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.2537345106614628, |
| "grad_norm": 1.7301759719848633, |
| "learning_rate": 4.873137358024008e-05, |
| "loss": 0.0833, |
| "num_input_tokens_seen": 28160000, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.2583478654007621, |
| "grad_norm": 2.2902233600616455, |
| "learning_rate": 4.8708306806543585e-05, |
| "loss": 0.0904, |
| "num_input_tokens_seen": 28672000, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.26296122014006146, |
| "grad_norm": 2.805758476257324, |
| "learning_rate": 4.868524003284709e-05, |
| "loss": 0.0854, |
| "num_input_tokens_seen": 29184000, |
| "step": 28500 |
| }, |
| { |
| "epoch": 0.26757457487936076, |
| "grad_norm": 1.0350342988967896, |
| "learning_rate": 4.8662173259150587e-05, |
| "loss": 0.0806, |
| "num_input_tokens_seen": 29696000, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.2721879296186601, |
| "grad_norm": 0.6509085893630981, |
| "learning_rate": 4.8639106485454094e-05, |
| "loss": 0.0846, |
| "num_input_tokens_seen": 30208000, |
| "step": 29500 |
| }, |
| { |
| "epoch": 0.2768012843579594, |
| "grad_norm": 1.2850301265716553, |
| "learning_rate": 4.86160397117576e-05, |
| "loss": 0.0857, |
| "num_input_tokens_seen": 30720000, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.2814146390972587, |
| "grad_norm": 1.7259219884872437, |
| "learning_rate": 4.85929729380611e-05, |
| "loss": 0.0839, |
| "num_input_tokens_seen": 31232000, |
| "step": 30500 |
| }, |
| { |
| "epoch": 0.2860279938365581, |
| "grad_norm": 1.7700318098068237, |
| "learning_rate": 4.85699061643646e-05, |
| "loss": 0.0768, |
| "num_input_tokens_seen": 31744000, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.2906413485758574, |
| "grad_norm": 1.1451270580291748, |
| "learning_rate": 4.854683939066811e-05, |
| "loss": 0.0824, |
| "num_input_tokens_seen": 32256000, |
| "step": 31500 |
| }, |
| { |
| "epoch": 0.29525470331515674, |
| "grad_norm": 1.772096872329712, |
| "learning_rate": 4.852377261697161e-05, |
| "loss": 0.0847, |
| "num_input_tokens_seen": 32768000, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.29986805805445604, |
| "grad_norm": 1.671513557434082, |
| "learning_rate": 4.850070584327511e-05, |
| "loss": 0.0838, |
| "num_input_tokens_seen": 33280000, |
| "step": 32500 |
| }, |
| { |
| "epoch": 0.30448141279375535, |
| "grad_norm": 0.9703548550605774, |
| "learning_rate": 4.847763906957862e-05, |
| "loss": 0.08, |
| "num_input_tokens_seen": 33792000, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.3090947675330547, |
| "grad_norm": 0.7928164601325989, |
| "learning_rate": 4.8454572295882126e-05, |
| "loss": 0.08, |
| "num_input_tokens_seen": 34304000, |
| "step": 33500 |
| }, |
| { |
| "epoch": 0.313708122272354, |
| "grad_norm": 1.1138111352920532, |
| "learning_rate": 4.8431505522185626e-05, |
| "loss": 0.0733, |
| "num_input_tokens_seen": 34816000, |
| "step": 34000 |
| }, |
| { |
| "epoch": 0.3183214770116533, |
| "grad_norm": 0.89890056848526, |
| "learning_rate": 4.840843874848913e-05, |
| "loss": 0.0828, |
| "num_input_tokens_seen": 35328000, |
| "step": 34500 |
| }, |
| { |
| "epoch": 0.32293483175095267, |
| "grad_norm": 2.127382516860962, |
| "learning_rate": 4.8385371974792634e-05, |
| "loss": 0.0818, |
| "num_input_tokens_seen": 35840000, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.32754818649025197, |
| "grad_norm": 1.0730081796646118, |
| "learning_rate": 4.8362305201096135e-05, |
| "loss": 0.0776, |
| "num_input_tokens_seen": 36352000, |
| "step": 35500 |
| }, |
| { |
| "epoch": 0.3321615412295513, |
| "grad_norm": 0.5055031180381775, |
| "learning_rate": 4.833923842739964e-05, |
| "loss": 0.085, |
| "num_input_tokens_seen": 36864000, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.33677489596885063, |
| "grad_norm": 2.764418601989746, |
| "learning_rate": 4.831617165370314e-05, |
| "loss": 0.0795, |
| "num_input_tokens_seen": 37376000, |
| "step": 36500 |
| }, |
| { |
| "epoch": 0.34138825070814993, |
| "grad_norm": 2.272135019302368, |
| "learning_rate": 4.829310488000664e-05, |
| "loss": 0.0757, |
| "num_input_tokens_seen": 37888000, |
| "step": 37000 |
| }, |
| { |
| "epoch": 0.3460016054474493, |
| "grad_norm": 2.2221481800079346, |
| "learning_rate": 4.827003810631015e-05, |
| "loss": 0.0881, |
| "num_input_tokens_seen": 38400000, |
| "step": 37500 |
| }, |
| { |
| "epoch": 0.3506149601867486, |
| "grad_norm": 1.7147547006607056, |
| "learning_rate": 4.824697133261365e-05, |
| "loss": 0.0805, |
| "num_input_tokens_seen": 38912000, |
| "step": 38000 |
| }, |
| { |
| "epoch": 0.35522831492604795, |
| "grad_norm": 2.031804084777832, |
| "learning_rate": 4.822390455891715e-05, |
| "loss": 0.0762, |
| "num_input_tokens_seen": 39424000, |
| "step": 38500 |
| }, |
| { |
| "epoch": 0.35984166966534725, |
| "grad_norm": 0.8008927702903748, |
| "learning_rate": 4.820083778522066e-05, |
| "loss": 0.0794, |
| "num_input_tokens_seen": 39936000, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.36445502440464655, |
| "grad_norm": 1.5696818828582764, |
| "learning_rate": 4.8177771011524167e-05, |
| "loss": 0.0821, |
| "num_input_tokens_seen": 40448000, |
| "step": 39500 |
| }, |
| { |
| "epoch": 0.3690683791439459, |
| "grad_norm": 0.7710667252540588, |
| "learning_rate": 4.815470423782766e-05, |
| "loss": 0.0776, |
| "num_input_tokens_seen": 40960000, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.3736817338832452, |
| "grad_norm": 1.0794172286987305, |
| "learning_rate": 4.813163746413117e-05, |
| "loss": 0.0781, |
| "num_input_tokens_seen": 41472000, |
| "step": 40500 |
| }, |
| { |
| "epoch": 0.3782950886225445, |
| "grad_norm": 2.43756365776062, |
| "learning_rate": 4.8108570690434675e-05, |
| "loss": 0.0787, |
| "num_input_tokens_seen": 41984000, |
| "step": 41000 |
| }, |
| { |
| "epoch": 0.3829084433618439, |
| "grad_norm": 0.6750785112380981, |
| "learning_rate": 4.8085503916738176e-05, |
| "loss": 0.081, |
| "num_input_tokens_seen": 42496000, |
| "step": 41500 |
| }, |
| { |
| "epoch": 0.3875217981011432, |
| "grad_norm": 0.7780609726905823, |
| "learning_rate": 4.8062437143041676e-05, |
| "loss": 0.0791, |
| "num_input_tokens_seen": 43008000, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.39213515284044254, |
| "grad_norm": 1.1585677862167358, |
| "learning_rate": 4.8039370369345184e-05, |
| "loss": 0.0811, |
| "num_input_tokens_seen": 43520000, |
| "step": 42500 |
| }, |
| { |
| "epoch": 0.39674850757974184, |
| "grad_norm": 2.7044448852539062, |
| "learning_rate": 4.8016303595648684e-05, |
| "loss": 0.0775, |
| "num_input_tokens_seen": 44032000, |
| "step": 43000 |
| }, |
| { |
| "epoch": 0.40136186231904114, |
| "grad_norm": 2.9311044216156006, |
| "learning_rate": 4.799323682195219e-05, |
| "loss": 0.0739, |
| "num_input_tokens_seen": 44544000, |
| "step": 43500 |
| }, |
| { |
| "epoch": 0.4059752170583405, |
| "grad_norm": 2.255924940109253, |
| "learning_rate": 4.797017004825569e-05, |
| "loss": 0.0814, |
| "num_input_tokens_seen": 45056000, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.4105885717976398, |
| "grad_norm": 3.5307369232177734, |
| "learning_rate": 4.79471032745592e-05, |
| "loss": 0.0773, |
| "num_input_tokens_seen": 45568000, |
| "step": 44500 |
| }, |
| { |
| "epoch": 0.41520192653693916, |
| "grad_norm": 0.7721351385116577, |
| "learning_rate": 4.79240365008627e-05, |
| "loss": 0.074, |
| "num_input_tokens_seen": 46080000, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.41981528127623846, |
| "grad_norm": 1.668393611907959, |
| "learning_rate": 4.79009697271662e-05, |
| "loss": 0.0763, |
| "num_input_tokens_seen": 46592000, |
| "step": 45500 |
| }, |
| { |
| "epoch": 0.42442863601553776, |
| "grad_norm": 2.3824353218078613, |
| "learning_rate": 4.787790295346971e-05, |
| "loss": 0.0772, |
| "num_input_tokens_seen": 47104000, |
| "step": 46000 |
| }, |
| { |
| "epoch": 0.4290419907548371, |
| "grad_norm": 2.127598762512207, |
| "learning_rate": 4.785483617977321e-05, |
| "loss": 0.0803, |
| "num_input_tokens_seen": 47616000, |
| "step": 46500 |
| }, |
| { |
| "epoch": 0.4336553454941364, |
| "grad_norm": 2.958203077316284, |
| "learning_rate": 4.7831769406076716e-05, |
| "loss": 0.0781, |
| "num_input_tokens_seen": 48128000, |
| "step": 47000 |
| }, |
| { |
| "epoch": 0.4382687002334357, |
| "grad_norm": 0.7533183693885803, |
| "learning_rate": 4.7808702632380217e-05, |
| "loss": 0.0793, |
| "num_input_tokens_seen": 48640000, |
| "step": 47500 |
| }, |
| { |
| "epoch": 0.4428820549727351, |
| "grad_norm": 1.3638031482696533, |
| "learning_rate": 4.778563585868372e-05, |
| "loss": 0.081, |
| "num_input_tokens_seen": 49152000, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.4474954097120344, |
| "grad_norm": 1.3746527433395386, |
| "learning_rate": 4.7762569084987225e-05, |
| "loss": 0.0863, |
| "num_input_tokens_seen": 49664000, |
| "step": 48500 |
| }, |
| { |
| "epoch": 0.45210876445133374, |
| "grad_norm": 1.5628637075424194, |
| "learning_rate": 4.773950231129073e-05, |
| "loss": 0.0799, |
| "num_input_tokens_seen": 50176000, |
| "step": 49000 |
| }, |
| { |
| "epoch": 0.45672211919063305, |
| "grad_norm": 1.8787376880645752, |
| "learning_rate": 4.7716435537594226e-05, |
| "loss": 0.0782, |
| "num_input_tokens_seen": 50688000, |
| "step": 49500 |
| }, |
| { |
| "epoch": 0.46133547392993235, |
| "grad_norm": 1.3804419040679932, |
| "learning_rate": 4.769336876389773e-05, |
| "loss": 0.0833, |
| "num_input_tokens_seen": 51200000, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.4659488286692317, |
| "grad_norm": 1.6135491132736206, |
| "learning_rate": 4.767030199020124e-05, |
| "loss": 0.0762, |
| "num_input_tokens_seen": 51712000, |
| "step": 50500 |
| }, |
| { |
| "epoch": 0.470562183408531, |
| "grad_norm": 2.186791181564331, |
| "learning_rate": 4.7647235216504734e-05, |
| "loss": 0.0797, |
| "num_input_tokens_seen": 52224000, |
| "step": 51000 |
| }, |
| { |
| "epoch": 0.4751755381478303, |
| "grad_norm": 1.6921688318252563, |
| "learning_rate": 4.762416844280824e-05, |
| "loss": 0.0812, |
| "num_input_tokens_seen": 52736000, |
| "step": 51500 |
| }, |
| { |
| "epoch": 0.47978889288712967, |
| "grad_norm": 0.95241379737854, |
| "learning_rate": 4.760110166911175e-05, |
| "loss": 0.0788, |
| "num_input_tokens_seen": 53248000, |
| "step": 52000 |
| }, |
| { |
| "epoch": 0.484402247626429, |
| "grad_norm": 3.2142257690429688, |
| "learning_rate": 4.757803489541525e-05, |
| "loss": 0.0776, |
| "num_input_tokens_seen": 53760000, |
| "step": 52500 |
| }, |
| { |
| "epoch": 0.48901560236572833, |
| "grad_norm": 3.2678260803222656, |
| "learning_rate": 4.755496812171875e-05, |
| "loss": 0.0753, |
| "num_input_tokens_seen": 54272000, |
| "step": 53000 |
| }, |
| { |
| "epoch": 0.49362895710502763, |
| "grad_norm": 2.8343145847320557, |
| "learning_rate": 4.753190134802226e-05, |
| "loss": 0.0784, |
| "num_input_tokens_seen": 54784000, |
| "step": 53500 |
| }, |
| { |
| "epoch": 0.49824231184432693, |
| "grad_norm": 1.4818017482757568, |
| "learning_rate": 4.750883457432576e-05, |
| "loss": 0.0752, |
| "num_input_tokens_seen": 55296000, |
| "step": 54000 |
| }, |
| { |
| "epoch": 0.5028556665836262, |
| "grad_norm": 1.2139348983764648, |
| "learning_rate": 4.7485767800629265e-05, |
| "loss": 0.0734, |
| "num_input_tokens_seen": 55808000, |
| "step": 54500 |
| }, |
| { |
| "epoch": 0.5074690213229256, |
| "grad_norm": 1.3937476873397827, |
| "learning_rate": 4.7462701026932766e-05, |
| "loss": 0.0759, |
| "num_input_tokens_seen": 56320000, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.512082376062225, |
| "grad_norm": 1.7801790237426758, |
| "learning_rate": 4.743963425323627e-05, |
| "loss": 0.0799, |
| "num_input_tokens_seen": 56832000, |
| "step": 55500 |
| }, |
| { |
| "epoch": 0.5166957308015242, |
| "grad_norm": 0.9710603952407837, |
| "learning_rate": 4.7416567479539774e-05, |
| "loss": 0.0705, |
| "num_input_tokens_seen": 57344000, |
| "step": 56000 |
| }, |
| { |
| "epoch": 0.5213090855408236, |
| "grad_norm": 1.3923077583312988, |
| "learning_rate": 4.739350070584328e-05, |
| "loss": 0.0778, |
| "num_input_tokens_seen": 57856000, |
| "step": 56500 |
| }, |
| { |
| "epoch": 0.5259224402801229, |
| "grad_norm": 0.5901740193367004, |
| "learning_rate": 4.737043393214678e-05, |
| "loss": 0.0729, |
| "num_input_tokens_seen": 58368000, |
| "step": 57000 |
| }, |
| { |
| "epoch": 0.5305357950194223, |
| "grad_norm": 1.3465195894241333, |
| "learning_rate": 4.734736715845028e-05, |
| "loss": 0.0797, |
| "num_input_tokens_seen": 58880000, |
| "step": 57500 |
| }, |
| { |
| "epoch": 0.5351491497587215, |
| "grad_norm": 0.48033392429351807, |
| "learning_rate": 4.732430038475379e-05, |
| "loss": 0.0736, |
| "num_input_tokens_seen": 59392000, |
| "step": 58000 |
| }, |
| { |
| "epoch": 0.5397625044980209, |
| "grad_norm": 1.3446660041809082, |
| "learning_rate": 4.730123361105729e-05, |
| "loss": 0.0778, |
| "num_input_tokens_seen": 59904000, |
| "step": 58500 |
| }, |
| { |
| "epoch": 0.5443758592373202, |
| "grad_norm": 0.895521342754364, |
| "learning_rate": 4.727816683736079e-05, |
| "loss": 0.0754, |
| "num_input_tokens_seen": 60416000, |
| "step": 59000 |
| }, |
| { |
| "epoch": 0.5489892139766195, |
| "grad_norm": 1.3843989372253418, |
| "learning_rate": 4.72551000636643e-05, |
| "loss": 0.0817, |
| "num_input_tokens_seen": 60928000, |
| "step": 59500 |
| }, |
| { |
| "epoch": 0.5536025687159188, |
| "grad_norm": 1.5670028924942017, |
| "learning_rate": 4.7232033289967806e-05, |
| "loss": 0.0742, |
| "num_input_tokens_seen": 61440000, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.5582159234552182, |
| "grad_norm": 1.4761849641799927, |
| "learning_rate": 4.72089665162713e-05, |
| "loss": 0.0688, |
| "num_input_tokens_seen": 61952000, |
| "step": 60500 |
| }, |
| { |
| "epoch": 0.5628292781945174, |
| "grad_norm": 6.005481719970703, |
| "learning_rate": 4.718589974257481e-05, |
| "loss": 0.0836, |
| "num_input_tokens_seen": 62464000, |
| "step": 61000 |
| }, |
| { |
| "epoch": 0.5674426329338168, |
| "grad_norm": 1.2835499048233032, |
| "learning_rate": 4.7162832968878314e-05, |
| "loss": 0.0731, |
| "num_input_tokens_seen": 62976000, |
| "step": 61500 |
| }, |
| { |
| "epoch": 0.5720559876731162, |
| "grad_norm": 1.769403338432312, |
| "learning_rate": 4.7139766195181815e-05, |
| "loss": 0.079, |
| "num_input_tokens_seen": 63488000, |
| "step": 62000 |
| }, |
| { |
| "epoch": 0.5766693424124154, |
| "grad_norm": 1.8391185998916626, |
| "learning_rate": 4.7116699421485315e-05, |
| "loss": 0.082, |
| "num_input_tokens_seen": 64000000, |
| "step": 62500 |
| }, |
| { |
| "epoch": 0.5812826971517148, |
| "grad_norm": 1.3075145483016968, |
| "learning_rate": 4.709363264778882e-05, |
| "loss": 0.0753, |
| "num_input_tokens_seen": 64512000, |
| "step": 63000 |
| }, |
| { |
| "epoch": 0.5858960518910141, |
| "grad_norm": 2.2406928539276123, |
| "learning_rate": 4.707056587409232e-05, |
| "loss": 0.0737, |
| "num_input_tokens_seen": 65024000, |
| "step": 63500 |
| }, |
| { |
| "epoch": 0.5905094066303135, |
| "grad_norm": 2.2750511169433594, |
| "learning_rate": 4.7047499100395824e-05, |
| "loss": 0.077, |
| "num_input_tokens_seen": 65536000, |
| "step": 64000 |
| }, |
| { |
| "epoch": 0.5951227613696127, |
| "grad_norm": 1.7060987949371338, |
| "learning_rate": 4.702443232669933e-05, |
| "loss": 0.0764, |
| "num_input_tokens_seen": 66048000, |
| "step": 64500 |
| }, |
| { |
| "epoch": 0.5997361161089121, |
| "grad_norm": 1.3420023918151855, |
| "learning_rate": 4.700136555300283e-05, |
| "loss": 0.0803, |
| "num_input_tokens_seen": 66560000, |
| "step": 65000 |
| }, |
| { |
| "epoch": 0.6043494708482114, |
| "grad_norm": 0.8915556073188782, |
| "learning_rate": 4.697829877930634e-05, |
| "loss": 0.0765, |
| "num_input_tokens_seen": 67072000, |
| "step": 65500 |
| }, |
| { |
| "epoch": 0.6089628255875107, |
| "grad_norm": 2.3567070960998535, |
| "learning_rate": 4.695523200560984e-05, |
| "loss": 0.0739, |
| "num_input_tokens_seen": 67584000, |
| "step": 66000 |
| }, |
| { |
| "epoch": 0.61357618032681, |
| "grad_norm": 1.8976528644561768, |
| "learning_rate": 4.693216523191335e-05, |
| "loss": 0.0738, |
| "num_input_tokens_seen": 68096000, |
| "step": 66500 |
| }, |
| { |
| "epoch": 0.6181895350661094, |
| "grad_norm": 2.0413930416107178, |
| "learning_rate": 4.690909845821685e-05, |
| "loss": 0.0826, |
| "num_input_tokens_seen": 68608000, |
| "step": 67000 |
| }, |
| { |
| "epoch": 0.6228028898054087, |
| "grad_norm": 4.672994613647461, |
| "learning_rate": 4.6886031684520355e-05, |
| "loss": 0.0773, |
| "num_input_tokens_seen": 69120000, |
| "step": 67500 |
| }, |
| { |
| "epoch": 0.627416244544708, |
| "grad_norm": 1.1743087768554688, |
| "learning_rate": 4.6862964910823856e-05, |
| "loss": 0.0745, |
| "num_input_tokens_seen": 69632000, |
| "step": 68000 |
| }, |
| { |
| "epoch": 0.6320295992840074, |
| "grad_norm": 0.7749766707420349, |
| "learning_rate": 4.6839898137127356e-05, |
| "loss": 0.0738, |
| "num_input_tokens_seen": 70144000, |
| "step": 68500 |
| }, |
| { |
| "epoch": 0.6366429540233066, |
| "grad_norm": 0.5075979232788086, |
| "learning_rate": 4.6816831363430864e-05, |
| "loss": 0.0747, |
| "num_input_tokens_seen": 70656000, |
| "step": 69000 |
| }, |
| { |
| "epoch": 0.641256308762606, |
| "grad_norm": 2.802272081375122, |
| "learning_rate": 4.679376458973437e-05, |
| "loss": 0.0825, |
| "num_input_tokens_seen": 71168000, |
| "step": 69500 |
| }, |
| { |
| "epoch": 0.6458696635019053, |
| "grad_norm": 1.798438549041748, |
| "learning_rate": 4.6770697816037865e-05, |
| "loss": 0.0766, |
| "num_input_tokens_seen": 71680000, |
| "step": 70000 |
| }, |
| { |
| "epoch": 0.6504830182412047, |
| "grad_norm": 1.7648403644561768, |
| "learning_rate": 4.674763104234137e-05, |
| "loss": 0.077, |
| "num_input_tokens_seen": 72192000, |
| "step": 70500 |
| }, |
| { |
| "epoch": 0.6550963729805039, |
| "grad_norm": 2.0195560455322266, |
| "learning_rate": 4.672456426864488e-05, |
| "loss": 0.0767, |
| "num_input_tokens_seen": 72704000, |
| "step": 71000 |
| }, |
| { |
| "epoch": 0.6597097277198033, |
| "grad_norm": 3.9862349033355713, |
| "learning_rate": 4.670149749494837e-05, |
| "loss": 0.0745, |
| "num_input_tokens_seen": 73216000, |
| "step": 71500 |
| }, |
| { |
| "epoch": 0.6643230824591027, |
| "grad_norm": 2.7226781845092773, |
| "learning_rate": 4.667843072125188e-05, |
| "loss": 0.0703, |
| "num_input_tokens_seen": 73728000, |
| "step": 72000 |
| }, |
| { |
| "epoch": 0.6689364371984019, |
| "grad_norm": 2.0484044551849365, |
| "learning_rate": 4.665536394755539e-05, |
| "loss": 0.0765, |
| "num_input_tokens_seen": 74240000, |
| "step": 72500 |
| }, |
| { |
| "epoch": 0.6735497919377013, |
| "grad_norm": 0.4825538694858551, |
| "learning_rate": 4.663229717385889e-05, |
| "loss": 0.0823, |
| "num_input_tokens_seen": 74752000, |
| "step": 73000 |
| }, |
| { |
| "epoch": 0.6781631466770006, |
| "grad_norm": 1.2127926349639893, |
| "learning_rate": 4.660923040016239e-05, |
| "loss": 0.0754, |
| "num_input_tokens_seen": 75264000, |
| "step": 73500 |
| }, |
| { |
| "epoch": 0.6827765014162999, |
| "grad_norm": 3.139049768447876, |
| "learning_rate": 4.6586163626465897e-05, |
| "loss": 0.0749, |
| "num_input_tokens_seen": 75776000, |
| "step": 74000 |
| }, |
| { |
| "epoch": 0.6873898561555992, |
| "grad_norm": 2.038872480392456, |
| "learning_rate": 4.65630968527694e-05, |
| "loss": 0.0753, |
| "num_input_tokens_seen": 76288000, |
| "step": 74500 |
| }, |
| { |
| "epoch": 0.6920032108948986, |
| "grad_norm": 4.1413469314575195, |
| "learning_rate": 4.6540030079072904e-05, |
| "loss": 0.0761, |
| "num_input_tokens_seen": 76800000, |
| "step": 75000 |
| }, |
| { |
| "epoch": 0.6966165656341978, |
| "grad_norm": 1.3078006505966187, |
| "learning_rate": 4.6516963305376405e-05, |
| "loss": 0.0766, |
| "num_input_tokens_seen": 77312000, |
| "step": 75500 |
| }, |
| { |
| "epoch": 0.7012299203734972, |
| "grad_norm": 1.2052334547042847, |
| "learning_rate": 4.649389653167991e-05, |
| "loss": 0.0749, |
| "num_input_tokens_seen": 77824000, |
| "step": 76000 |
| }, |
| { |
| "epoch": 0.7058432751127965, |
| "grad_norm": 1.5266985893249512, |
| "learning_rate": 4.647082975798341e-05, |
| "loss": 0.0768, |
| "num_input_tokens_seen": 78336000, |
| "step": 76500 |
| }, |
| { |
| "epoch": 0.7104566298520959, |
| "grad_norm": 13.878520011901855, |
| "learning_rate": 4.6447762984286914e-05, |
| "loss": 0.0813, |
| "num_input_tokens_seen": 78848000, |
| "step": 77000 |
| }, |
| { |
| "epoch": 0.7150699845913951, |
| "grad_norm": 0.8548376560211182, |
| "learning_rate": 4.642469621059042e-05, |
| "loss": 0.0693, |
| "num_input_tokens_seen": 79360000, |
| "step": 77500 |
| }, |
| { |
| "epoch": 0.7196833393306945, |
| "grad_norm": 1.8979346752166748, |
| "learning_rate": 4.640162943689392e-05, |
| "loss": 0.0795, |
| "num_input_tokens_seen": 79872000, |
| "step": 78000 |
| }, |
| { |
| "epoch": 0.7242966940699939, |
| "grad_norm": 0.6193153262138367, |
| "learning_rate": 4.637856266319743e-05, |
| "loss": 0.0776, |
| "num_input_tokens_seen": 80384000, |
| "step": 78500 |
| }, |
| { |
| "epoch": 0.7289100488092931, |
| "grad_norm": 1.736380934715271, |
| "learning_rate": 4.635549588950093e-05, |
| "loss": 0.079, |
| "num_input_tokens_seen": 80896000, |
| "step": 79000 |
| }, |
| { |
| "epoch": 0.7335234035485925, |
| "grad_norm": 3.559295415878296, |
| "learning_rate": 4.633242911580443e-05, |
| "loss": 0.0792, |
| "num_input_tokens_seen": 81408000, |
| "step": 79500 |
| }, |
| { |
| "epoch": 0.7381367582878918, |
| "grad_norm": 1.017986536026001, |
| "learning_rate": 4.630936234210794e-05, |
| "loss": 0.0782, |
| "num_input_tokens_seen": 81920000, |
| "step": 80000 |
| }, |
| { |
| "epoch": 0.7427501130271911, |
| "grad_norm": 1.2457808256149292, |
| "learning_rate": 4.6286295568411445e-05, |
| "loss": 0.0766, |
| "num_input_tokens_seen": 82432000, |
| "step": 80500 |
| }, |
| { |
| "epoch": 0.7473634677664904, |
| "grad_norm": 0.6746057271957397, |
| "learning_rate": 4.626322879471494e-05, |
| "loss": 0.0728, |
| "num_input_tokens_seen": 82944000, |
| "step": 81000 |
| }, |
| { |
| "epoch": 0.7519768225057898, |
| "grad_norm": 1.1048623323440552, |
| "learning_rate": 4.6240162021018446e-05, |
| "loss": 0.0763, |
| "num_input_tokens_seen": 83456000, |
| "step": 81500 |
| }, |
| { |
| "epoch": 0.756590177245089, |
| "grad_norm": 2.0804615020751953, |
| "learning_rate": 4.621709524732195e-05, |
| "loss": 0.0736, |
| "num_input_tokens_seen": 83968000, |
| "step": 82000 |
| }, |
| { |
| "epoch": 0.7612035319843884, |
| "grad_norm": 0.7726876735687256, |
| "learning_rate": 4.6194028473625454e-05, |
| "loss": 0.0756, |
| "num_input_tokens_seen": 84480000, |
| "step": 82500 |
| }, |
| { |
| "epoch": 0.7658168867236878, |
| "grad_norm": 1.618414044380188, |
| "learning_rate": 4.6170961699928954e-05, |
| "loss": 0.0736, |
| "num_input_tokens_seen": 84992000, |
| "step": 83000 |
| }, |
| { |
| "epoch": 0.7704302414629871, |
| "grad_norm": 0.2806508243083954, |
| "learning_rate": 4.614789492623246e-05, |
| "loss": 0.0757, |
| "num_input_tokens_seen": 85504000, |
| "step": 83500 |
| }, |
| { |
| "epoch": 0.7750435962022864, |
| "grad_norm": 1.093205451965332, |
| "learning_rate": 4.612482815253596e-05, |
| "loss": 0.0746, |
| "num_input_tokens_seen": 86016000, |
| "step": 84000 |
| }, |
| { |
| "epoch": 0.7796569509415857, |
| "grad_norm": 0.8395510911941528, |
| "learning_rate": 4.610176137883946e-05, |
| "loss": 0.0728, |
| "num_input_tokens_seen": 86528000, |
| "step": 84500 |
| }, |
| { |
| "epoch": 0.7842703056808851, |
| "grad_norm": 5.429121017456055, |
| "learning_rate": 4.607869460514297e-05, |
| "loss": 0.0752, |
| "num_input_tokens_seen": 87040000, |
| "step": 85000 |
| }, |
| { |
| "epoch": 0.7888836604201843, |
| "grad_norm": 1.0684977769851685, |
| "learning_rate": 4.605562783144647e-05, |
| "loss": 0.0734, |
| "num_input_tokens_seen": 87552000, |
| "step": 85500 |
| }, |
| { |
| "epoch": 0.7934970151594837, |
| "grad_norm": 4.412910461425781, |
| "learning_rate": 4.603256105774998e-05, |
| "loss": 0.0724, |
| "num_input_tokens_seen": 88064000, |
| "step": 86000 |
| }, |
| { |
| "epoch": 0.798110369898783, |
| "grad_norm": 1.352186918258667, |
| "learning_rate": 4.600949428405348e-05, |
| "loss": 0.0752, |
| "num_input_tokens_seen": 88576000, |
| "step": 86500 |
| }, |
| { |
| "epoch": 0.8027237246380823, |
| "grad_norm": 3.716979742050171, |
| "learning_rate": 4.5986427510356986e-05, |
| "loss": 0.0712, |
| "num_input_tokens_seen": 89088000, |
| "step": 87000 |
| }, |
| { |
| "epoch": 0.8073370793773816, |
| "grad_norm": 1.6584104299545288, |
| "learning_rate": 4.596336073666049e-05, |
| "loss": 0.0733, |
| "num_input_tokens_seen": 89600000, |
| "step": 87500 |
| }, |
| { |
| "epoch": 0.811950434116681, |
| "grad_norm": 2.3811452388763428, |
| "learning_rate": 4.5940293962963994e-05, |
| "loss": 0.0763, |
| "num_input_tokens_seen": 90112000, |
| "step": 88000 |
| }, |
| { |
| "epoch": 0.8165637888559802, |
| "grad_norm": 1.4352256059646606, |
| "learning_rate": 4.5917227189267495e-05, |
| "loss": 0.0696, |
| "num_input_tokens_seen": 90624000, |
| "step": 88500 |
| }, |
| { |
| "epoch": 0.8211771435952796, |
| "grad_norm": 2.95996356010437, |
| "learning_rate": 4.5894160415570995e-05, |
| "loss": 0.0675, |
| "num_input_tokens_seen": 91136000, |
| "step": 89000 |
| }, |
| { |
| "epoch": 0.825790498334579, |
| "grad_norm": 1.790480375289917, |
| "learning_rate": 4.58710936418745e-05, |
| "loss": 0.0737, |
| "num_input_tokens_seen": 91648000, |
| "step": 89500 |
| }, |
| { |
| "epoch": 0.8304038530738783, |
| "grad_norm": 2.4636244773864746, |
| "learning_rate": 4.5848026868178e-05, |
| "loss": 0.0725, |
| "num_input_tokens_seen": 92160000, |
| "step": 90000 |
| }, |
| { |
| "epoch": 0.8350172078131776, |
| "grad_norm": 1.4085214138031006, |
| "learning_rate": 4.5824960094481504e-05, |
| "loss": 0.0801, |
| "num_input_tokens_seen": 92672000, |
| "step": 90500 |
| }, |
| { |
| "epoch": 0.8396305625524769, |
| "grad_norm": 1.5080194473266602, |
| "learning_rate": 4.580189332078501e-05, |
| "loss": 0.0707, |
| "num_input_tokens_seen": 93184000, |
| "step": 91000 |
| }, |
| { |
| "epoch": 0.8442439172917763, |
| "grad_norm": 0.8035141229629517, |
| "learning_rate": 4.577882654708852e-05, |
| "loss": 0.0775, |
| "num_input_tokens_seen": 93696000, |
| "step": 91500 |
| }, |
| { |
| "epoch": 0.8488572720310755, |
| "grad_norm": 1.832581639289856, |
| "learning_rate": 4.575575977339201e-05, |
| "loss": 0.076, |
| "num_input_tokens_seen": 94208000, |
| "step": 92000 |
| }, |
| { |
| "epoch": 0.8534706267703749, |
| "grad_norm": 0.5887289047241211, |
| "learning_rate": 4.573269299969552e-05, |
| "loss": 0.0752, |
| "num_input_tokens_seen": 94720000, |
| "step": 92500 |
| }, |
| { |
| "epoch": 0.8580839815096742, |
| "grad_norm": 0.7849867939949036, |
| "learning_rate": 4.570962622599903e-05, |
| "loss": 0.0815, |
| "num_input_tokens_seen": 95232000, |
| "step": 93000 |
| }, |
| { |
| "epoch": 0.8626973362489735, |
| "grad_norm": 2.76053524017334, |
| "learning_rate": 4.568655945230253e-05, |
| "loss": 0.0696, |
| "num_input_tokens_seen": 95744000, |
| "step": 93500 |
| }, |
| { |
| "epoch": 0.8673106909882728, |
| "grad_norm": 0.608044445514679, |
| "learning_rate": 4.566349267860603e-05, |
| "loss": 0.0764, |
| "num_input_tokens_seen": 96256000, |
| "step": 94000 |
| }, |
| { |
| "epoch": 0.8719240457275722, |
| "grad_norm": 2.4751555919647217, |
| "learning_rate": 4.5640425904909536e-05, |
| "loss": 0.0706, |
| "num_input_tokens_seen": 96768000, |
| "step": 94500 |
| }, |
| { |
| "epoch": 0.8765374004668715, |
| "grad_norm": 0.5605325698852539, |
| "learning_rate": 4.5617359131213036e-05, |
| "loss": 0.074, |
| "num_input_tokens_seen": 97280000, |
| "step": 95000 |
| }, |
| { |
| "epoch": 0.8811507552061708, |
| "grad_norm": 2.0805656909942627, |
| "learning_rate": 4.5594292357516544e-05, |
| "loss": 0.0723, |
| "num_input_tokens_seen": 97792000, |
| "step": 95500 |
| }, |
| { |
| "epoch": 0.8857641099454702, |
| "grad_norm": 0.8538010120391846, |
| "learning_rate": 4.5571225583820044e-05, |
| "loss": 0.0755, |
| "num_input_tokens_seen": 98304000, |
| "step": 96000 |
| }, |
| { |
| "epoch": 0.8903774646847694, |
| "grad_norm": 0.7344834804534912, |
| "learning_rate": 4.5548158810123545e-05, |
| "loss": 0.0722, |
| "num_input_tokens_seen": 98816000, |
| "step": 96500 |
| }, |
| { |
| "epoch": 0.8949908194240688, |
| "grad_norm": 0.9666327238082886, |
| "learning_rate": 4.552509203642705e-05, |
| "loss": 0.0777, |
| "num_input_tokens_seen": 99328000, |
| "step": 97000 |
| }, |
| { |
| "epoch": 0.8996041741633681, |
| "grad_norm": 1.5512099266052246, |
| "learning_rate": 4.550202526273055e-05, |
| "loss": 0.0751, |
| "num_input_tokens_seen": 99840000, |
| "step": 97500 |
| }, |
| { |
| "epoch": 0.9042175289026675, |
| "grad_norm": 0.9923927187919617, |
| "learning_rate": 4.547895848903406e-05, |
| "loss": 0.073, |
| "num_input_tokens_seen": 100352000, |
| "step": 98000 |
| }, |
| { |
| "epoch": 0.9088308836419667, |
| "grad_norm": 1.5789976119995117, |
| "learning_rate": 4.545589171533756e-05, |
| "loss": 0.068, |
| "num_input_tokens_seen": 100864000, |
| "step": 98500 |
| }, |
| { |
| "epoch": 0.9134442383812661, |
| "grad_norm": 0.3622562885284424, |
| "learning_rate": 4.543282494164107e-05, |
| "loss": 0.0711, |
| "num_input_tokens_seen": 101376000, |
| "step": 99000 |
| }, |
| { |
| "epoch": 0.9180575931205655, |
| "grad_norm": 1.9762753248214722, |
| "learning_rate": 4.540975816794457e-05, |
| "loss": 0.0678, |
| "num_input_tokens_seen": 101888000, |
| "step": 99500 |
| }, |
| { |
| "epoch": 0.9226709478598647, |
| "grad_norm": 2.144947052001953, |
| "learning_rate": 4.538669139424807e-05, |
| "loss": 0.0705, |
| "num_input_tokens_seen": 102400000, |
| "step": 100000 |
| }, |
| { |
| "epoch": 0.9272843025991641, |
| "grad_norm": 0.5793939232826233, |
| "learning_rate": 4.5363624620551576e-05, |
| "loss": 0.0798, |
| "num_input_tokens_seen": 102912000, |
| "step": 100500 |
| }, |
| { |
| "epoch": 0.9318976573384634, |
| "grad_norm": 1.8652976751327515, |
| "learning_rate": 4.5340557846855084e-05, |
| "loss": 0.0723, |
| "num_input_tokens_seen": 103424000, |
| "step": 101000 |
| }, |
| { |
| "epoch": 0.9365110120777627, |
| "grad_norm": 1.8371716737747192, |
| "learning_rate": 4.531749107315858e-05, |
| "loss": 0.0752, |
| "num_input_tokens_seen": 103936000, |
| "step": 101500 |
| }, |
| { |
| "epoch": 0.941124366817062, |
| "grad_norm": 1.0695359706878662, |
| "learning_rate": 4.5294424299462085e-05, |
| "loss": 0.0786, |
| "num_input_tokens_seen": 104448000, |
| "step": 102000 |
| }, |
| { |
| "epoch": 0.9457377215563614, |
| "grad_norm": 1.6259958744049072, |
| "learning_rate": 4.527135752576559e-05, |
| "loss": 0.0726, |
| "num_input_tokens_seen": 104960000, |
| "step": 102500 |
| }, |
| { |
| "epoch": 0.9503510762956606, |
| "grad_norm": 2.0838193893432617, |
| "learning_rate": 4.5248290752069086e-05, |
| "loss": 0.0729, |
| "num_input_tokens_seen": 105472000, |
| "step": 103000 |
| }, |
| { |
| "epoch": 0.95496443103496, |
| "grad_norm": 1.8072469234466553, |
| "learning_rate": 4.5225223978372593e-05, |
| "loss": 0.0725, |
| "num_input_tokens_seen": 105984000, |
| "step": 103500 |
| }, |
| { |
| "epoch": 0.9595777857742593, |
| "grad_norm": 1.4469674825668335, |
| "learning_rate": 4.52021572046761e-05, |
| "loss": 0.0762, |
| "num_input_tokens_seen": 106496000, |
| "step": 104000 |
| }, |
| { |
| "epoch": 0.9641911405135587, |
| "grad_norm": 0.8151160478591919, |
| "learning_rate": 4.51790904309796e-05, |
| "loss": 0.0713, |
| "num_input_tokens_seen": 107008000, |
| "step": 104500 |
| }, |
| { |
| "epoch": 0.968804495252858, |
| "grad_norm": 2.5363306999206543, |
| "learning_rate": 4.51560236572831e-05, |
| "loss": 0.0717, |
| "num_input_tokens_seen": 107520000, |
| "step": 105000 |
| }, |
| { |
| "epoch": 0.9734178499921573, |
| "grad_norm": 2.3089513778686523, |
| "learning_rate": 4.513295688358661e-05, |
| "loss": 0.075, |
| "num_input_tokens_seen": 108032000, |
| "step": 105500 |
| }, |
| { |
| "epoch": 0.9780312047314567, |
| "grad_norm": 1.2738145589828491, |
| "learning_rate": 4.510989010989011e-05, |
| "loss": 0.0739, |
| "num_input_tokens_seen": 108544000, |
| "step": 106000 |
| }, |
| { |
| "epoch": 0.9826445594707559, |
| "grad_norm": 0.9310311675071716, |
| "learning_rate": 4.508682333619362e-05, |
| "loss": 0.0715, |
| "num_input_tokens_seen": 109056000, |
| "step": 106500 |
| }, |
| { |
| "epoch": 0.9872579142100553, |
| "grad_norm": 1.332413911819458, |
| "learning_rate": 4.506375656249712e-05, |
| "loss": 0.0762, |
| "num_input_tokens_seen": 109568000, |
| "step": 107000 |
| }, |
| { |
| "epoch": 0.9918712689493546, |
| "grad_norm": 1.171770691871643, |
| "learning_rate": 4.504068978880062e-05, |
| "loss": 0.0682, |
| "num_input_tokens_seen": 110080000, |
| "step": 107500 |
| }, |
| { |
| "epoch": 0.9964846236886539, |
| "grad_norm": 1.318642497062683, |
| "learning_rate": 4.5017623015104126e-05, |
| "loss": 0.0725, |
| "num_input_tokens_seen": 110592000, |
| "step": 108000 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_combined_score": 0.07267016709729579, |
| "eval_loss": 0.07267016172409058, |
| "eval_mse": 0.07267016501992041, |
| "eval_runtime": 46.4186, |
| "eval_samples_per_second": 2075.42, |
| "eval_steps_per_second": 259.444, |
| "num_input_tokens_seen": 110981376, |
| "step": 108381 |
| }, |
| { |
| "epoch": 1.0010979784279532, |
| "grad_norm": 2.0301551818847656, |
| "learning_rate": 4.499455624140763e-05, |
| "loss": 0.0723, |
| "num_input_tokens_seen": 111103232, |
| "step": 108500 |
| }, |
| { |
| "epoch": 1.0057113331672525, |
| "grad_norm": 0.46064960956573486, |
| "learning_rate": 4.4971489467711134e-05, |
| "loss": 0.066, |
| "num_input_tokens_seen": 111615232, |
| "step": 109000 |
| }, |
| { |
| "epoch": 1.010324687906552, |
| "grad_norm": 2.481804132461548, |
| "learning_rate": 4.4948422694014634e-05, |
| "loss": 0.0567, |
| "num_input_tokens_seen": 112127232, |
| "step": 109500 |
| }, |
| { |
| "epoch": 1.0149380426458512, |
| "grad_norm": 1.0883979797363281, |
| "learning_rate": 4.492535592031814e-05, |
| "loss": 0.0591, |
| "num_input_tokens_seen": 112639232, |
| "step": 110000 |
| }, |
| { |
| "epoch": 1.0195513973851504, |
| "grad_norm": 1.5821534395217896, |
| "learning_rate": 4.490228914662164e-05, |
| "loss": 0.0575, |
| "num_input_tokens_seen": 113151232, |
| "step": 110500 |
| }, |
| { |
| "epoch": 1.02416475212445, |
| "grad_norm": 1.1834355592727661, |
| "learning_rate": 4.487922237292514e-05, |
| "loss": 0.0643, |
| "num_input_tokens_seen": 113663232, |
| "step": 111000 |
| }, |
| { |
| "epoch": 1.0287781068637492, |
| "grad_norm": 0.5016165375709534, |
| "learning_rate": 4.485615559922865e-05, |
| "loss": 0.0598, |
| "num_input_tokens_seen": 114175232, |
| "step": 111500 |
| }, |
| { |
| "epoch": 1.0333914616030484, |
| "grad_norm": 2.372044086456299, |
| "learning_rate": 4.483308882553216e-05, |
| "loss": 0.0608, |
| "num_input_tokens_seen": 114687232, |
| "step": 112000 |
| }, |
| { |
| "epoch": 1.0380048163423479, |
| "grad_norm": 1.4434441328048706, |
| "learning_rate": 4.481002205183565e-05, |
| "loss": 0.059, |
| "num_input_tokens_seen": 115199232, |
| "step": 112500 |
| }, |
| { |
| "epoch": 1.0426181710816471, |
| "grad_norm": 1.329825520515442, |
| "learning_rate": 4.478695527813916e-05, |
| "loss": 0.061, |
| "num_input_tokens_seen": 115711232, |
| "step": 113000 |
| }, |
| { |
| "epoch": 1.0472315258209464, |
| "grad_norm": 0.6627879738807678, |
| "learning_rate": 4.4763888504442666e-05, |
| "loss": 0.0562, |
| "num_input_tokens_seen": 116223232, |
| "step": 113500 |
| }, |
| { |
| "epoch": 1.0518448805602458, |
| "grad_norm": 1.4965338706970215, |
| "learning_rate": 4.474082173074617e-05, |
| "loss": 0.0614, |
| "num_input_tokens_seen": 116735232, |
| "step": 114000 |
| }, |
| { |
| "epoch": 1.056458235299545, |
| "grad_norm": 4.595455646514893, |
| "learning_rate": 4.471775495704967e-05, |
| "loss": 0.0569, |
| "num_input_tokens_seen": 117247232, |
| "step": 114500 |
| }, |
| { |
| "epoch": 1.0610715900388445, |
| "grad_norm": 1.5899192094802856, |
| "learning_rate": 4.4694688183353175e-05, |
| "loss": 0.058, |
| "num_input_tokens_seen": 117759232, |
| "step": 115000 |
| }, |
| { |
| "epoch": 1.0656849447781438, |
| "grad_norm": 1.812812328338623, |
| "learning_rate": 4.4671621409656675e-05, |
| "loss": 0.0564, |
| "num_input_tokens_seen": 118271232, |
| "step": 115500 |
| }, |
| { |
| "epoch": 1.070298299517443, |
| "grad_norm": 1.8089003562927246, |
| "learning_rate": 4.4648554635960176e-05, |
| "loss": 0.0664, |
| "num_input_tokens_seen": 118783232, |
| "step": 116000 |
| }, |
| { |
| "epoch": 1.0749116542567425, |
| "grad_norm": 2.216608762741089, |
| "learning_rate": 4.462548786226368e-05, |
| "loss": 0.0599, |
| "num_input_tokens_seen": 119295232, |
| "step": 116500 |
| }, |
| { |
| "epoch": 1.0795250089960418, |
| "grad_norm": 2.6362509727478027, |
| "learning_rate": 4.4602421088567184e-05, |
| "loss": 0.0585, |
| "num_input_tokens_seen": 119807232, |
| "step": 117000 |
| }, |
| { |
| "epoch": 1.084138363735341, |
| "grad_norm": 0.8326151371002197, |
| "learning_rate": 4.457935431487069e-05, |
| "loss": 0.0593, |
| "num_input_tokens_seen": 120319232, |
| "step": 117500 |
| }, |
| { |
| "epoch": 1.0887517184746405, |
| "grad_norm": 1.3363105058670044, |
| "learning_rate": 4.455628754117419e-05, |
| "loss": 0.056, |
| "num_input_tokens_seen": 120831232, |
| "step": 118000 |
| }, |
| { |
| "epoch": 1.0933650732139397, |
| "grad_norm": 2.2342283725738525, |
| "learning_rate": 4.45332207674777e-05, |
| "loss": 0.0607, |
| "num_input_tokens_seen": 121343232, |
| "step": 118500 |
| }, |
| { |
| "epoch": 1.097978427953239, |
| "grad_norm": 1.9718506336212158, |
| "learning_rate": 4.45101539937812e-05, |
| "loss": 0.0625, |
| "num_input_tokens_seen": 121855232, |
| "step": 119000 |
| }, |
| { |
| "epoch": 1.1025917826925384, |
| "grad_norm": 0.7142735123634338, |
| "learning_rate": 4.448708722008471e-05, |
| "loss": 0.0565, |
| "num_input_tokens_seen": 122367232, |
| "step": 119500 |
| }, |
| { |
| "epoch": 1.1072051374318377, |
| "grad_norm": 1.1628931760787964, |
| "learning_rate": 4.446402044638821e-05, |
| "loss": 0.0583, |
| "num_input_tokens_seen": 122879232, |
| "step": 120000 |
| }, |
| { |
| "epoch": 1.111818492171137, |
| "grad_norm": 1.8776410818099976, |
| "learning_rate": 4.444095367269171e-05, |
| "loss": 0.0642, |
| "num_input_tokens_seen": 123391232, |
| "step": 120500 |
| }, |
| { |
| "epoch": 1.1164318469104364, |
| "grad_norm": 1.5755925178527832, |
| "learning_rate": 4.4417886898995216e-05, |
| "loss": 0.0631, |
| "num_input_tokens_seen": 123903232, |
| "step": 121000 |
| }, |
| { |
| "epoch": 1.1210452016497356, |
| "grad_norm": 1.7925944328308105, |
| "learning_rate": 4.4394820125298716e-05, |
| "loss": 0.0603, |
| "num_input_tokens_seen": 124415232, |
| "step": 121500 |
| }, |
| { |
| "epoch": 1.125658556389035, |
| "grad_norm": 2.4041876792907715, |
| "learning_rate": 4.437175335160222e-05, |
| "loss": 0.0552, |
| "num_input_tokens_seen": 124927232, |
| "step": 122000 |
| }, |
| { |
| "epoch": 1.1302719111283344, |
| "grad_norm": 2.1456570625305176, |
| "learning_rate": 4.4348686577905724e-05, |
| "loss": 0.065, |
| "num_input_tokens_seen": 125439232, |
| "step": 122500 |
| }, |
| { |
| "epoch": 1.1348852658676336, |
| "grad_norm": 1.278905987739563, |
| "learning_rate": 4.432561980420923e-05, |
| "loss": 0.0648, |
| "num_input_tokens_seen": 125951232, |
| "step": 123000 |
| }, |
| { |
| "epoch": 1.1394986206069329, |
| "grad_norm": 1.4145876169204712, |
| "learning_rate": 4.4302553030512725e-05, |
| "loss": 0.0603, |
| "num_input_tokens_seen": 126463232, |
| "step": 123500 |
| }, |
| { |
| "epoch": 1.1441119753462323, |
| "grad_norm": 1.247292160987854, |
| "learning_rate": 4.427948625681623e-05, |
| "loss": 0.0616, |
| "num_input_tokens_seen": 126975232, |
| "step": 124000 |
| }, |
| { |
| "epoch": 1.1487253300855316, |
| "grad_norm": 1.0648530721664429, |
| "learning_rate": 4.425641948311974e-05, |
| "loss": 0.0577, |
| "num_input_tokens_seen": 127487232, |
| "step": 124500 |
| }, |
| { |
| "epoch": 1.1533386848248308, |
| "grad_norm": 2.285616874694824, |
| "learning_rate": 4.423335270942324e-05, |
| "loss": 0.0574, |
| "num_input_tokens_seen": 127999232, |
| "step": 125000 |
| }, |
| { |
| "epoch": 1.1579520395641303, |
| "grad_norm": 1.124847173690796, |
| "learning_rate": 4.421028593572674e-05, |
| "loss": 0.0599, |
| "num_input_tokens_seen": 128511232, |
| "step": 125500 |
| }, |
| { |
| "epoch": 1.1625653943034295, |
| "grad_norm": 2.4443585872650146, |
| "learning_rate": 4.418721916203025e-05, |
| "loss": 0.0568, |
| "num_input_tokens_seen": 129023232, |
| "step": 126000 |
| }, |
| { |
| "epoch": 1.167178749042729, |
| "grad_norm": 0.8579834699630737, |
| "learning_rate": 4.416415238833375e-05, |
| "loss": 0.0628, |
| "num_input_tokens_seen": 129535232, |
| "step": 126500 |
| }, |
| { |
| "epoch": 1.1717921037820282, |
| "grad_norm": 3.7771518230438232, |
| "learning_rate": 4.4141085614637256e-05, |
| "loss": 0.0618, |
| "num_input_tokens_seen": 130047232, |
| "step": 127000 |
| }, |
| { |
| "epoch": 1.1764054585213275, |
| "grad_norm": 1.2302302122116089, |
| "learning_rate": 4.411801884094076e-05, |
| "loss": 0.0569, |
| "num_input_tokens_seen": 130559232, |
| "step": 127500 |
| }, |
| { |
| "epoch": 1.1810188132606267, |
| "grad_norm": 5.366886615753174, |
| "learning_rate": 4.409495206724426e-05, |
| "loss": 0.0581, |
| "num_input_tokens_seen": 131071232, |
| "step": 128000 |
| }, |
| { |
| "epoch": 1.1856321679999262, |
| "grad_norm": 1.6237967014312744, |
| "learning_rate": 4.4071885293547765e-05, |
| "loss": 0.0564, |
| "num_input_tokens_seen": 131583232, |
| "step": 128500 |
| }, |
| { |
| "epoch": 1.1902455227392255, |
| "grad_norm": 1.025489091873169, |
| "learning_rate": 4.4048818519851265e-05, |
| "loss": 0.062, |
| "num_input_tokens_seen": 132095232, |
| "step": 129000 |
| }, |
| { |
| "epoch": 1.194858877478525, |
| "grad_norm": 3.0035746097564697, |
| "learning_rate": 4.402575174615477e-05, |
| "loss": 0.0567, |
| "num_input_tokens_seen": 132607232, |
| "step": 129500 |
| }, |
| { |
| "epoch": 1.1994722322178242, |
| "grad_norm": 0.4716099202632904, |
| "learning_rate": 4.4002684972458273e-05, |
| "loss": 0.0594, |
| "num_input_tokens_seen": 133119232, |
| "step": 130000 |
| }, |
| { |
| "epoch": 1.2040855869571234, |
| "grad_norm": 1.073433756828308, |
| "learning_rate": 4.397961819876178e-05, |
| "loss": 0.0638, |
| "num_input_tokens_seen": 133631232, |
| "step": 130500 |
| }, |
| { |
| "epoch": 1.208698941696423, |
| "grad_norm": 1.676879644393921, |
| "learning_rate": 4.395655142506528e-05, |
| "loss": 0.0665, |
| "num_input_tokens_seen": 134143232, |
| "step": 131000 |
| }, |
| { |
| "epoch": 1.2133122964357221, |
| "grad_norm": 1.4313554763793945, |
| "learning_rate": 4.393348465136878e-05, |
| "loss": 0.062, |
| "num_input_tokens_seen": 134655232, |
| "step": 131500 |
| }, |
| { |
| "epoch": 1.2179256511750214, |
| "grad_norm": 1.8880019187927246, |
| "learning_rate": 4.391041787767229e-05, |
| "loss": 0.0568, |
| "num_input_tokens_seen": 135167232, |
| "step": 132000 |
| }, |
| { |
| "epoch": 1.2225390059143209, |
| "grad_norm": 1.572786569595337, |
| "learning_rate": 4.38873511039758e-05, |
| "loss": 0.0581, |
| "num_input_tokens_seen": 135679232, |
| "step": 132500 |
| }, |
| { |
| "epoch": 1.22715236065362, |
| "grad_norm": 1.1069833040237427, |
| "learning_rate": 4.386428433027929e-05, |
| "loss": 0.0567, |
| "num_input_tokens_seen": 136191232, |
| "step": 133000 |
| }, |
| { |
| "epoch": 1.2317657153929193, |
| "grad_norm": 1.1832222938537598, |
| "learning_rate": 4.38412175565828e-05, |
| "loss": 0.0589, |
| "num_input_tokens_seen": 136703232, |
| "step": 133500 |
| }, |
| { |
| "epoch": 1.2363790701322188, |
| "grad_norm": 0.8395095467567444, |
| "learning_rate": 4.3818150782886305e-05, |
| "loss": 0.0607, |
| "num_input_tokens_seen": 137215232, |
| "step": 134000 |
| }, |
| { |
| "epoch": 1.240992424871518, |
| "grad_norm": 1.2240726947784424, |
| "learning_rate": 4.3795084009189806e-05, |
| "loss": 0.0639, |
| "num_input_tokens_seen": 137727232, |
| "step": 134500 |
| }, |
| { |
| "epoch": 1.2456057796108173, |
| "grad_norm": 0.596113383769989, |
| "learning_rate": 4.3772017235493306e-05, |
| "loss": 0.0622, |
| "num_input_tokens_seen": 138239232, |
| "step": 135000 |
| }, |
| { |
| "epoch": 1.2502191343501168, |
| "grad_norm": 1.9236828088760376, |
| "learning_rate": 4.3748950461796814e-05, |
| "loss": 0.0607, |
| "num_input_tokens_seen": 138751232, |
| "step": 135500 |
| }, |
| { |
| "epoch": 1.254832489089416, |
| "grad_norm": 0.9456164836883545, |
| "learning_rate": 4.3725883688100314e-05, |
| "loss": 0.0583, |
| "num_input_tokens_seen": 139263232, |
| "step": 136000 |
| }, |
| { |
| "epoch": 1.2594458438287153, |
| "grad_norm": 3.4136688709259033, |
| "learning_rate": 4.3702816914403815e-05, |
| "loss": 0.0638, |
| "num_input_tokens_seen": 139775232, |
| "step": 136500 |
| }, |
| { |
| "epoch": 1.2640591985680147, |
| "grad_norm": 1.01094388961792, |
| "learning_rate": 4.367975014070732e-05, |
| "loss": 0.0598, |
| "num_input_tokens_seen": 140287232, |
| "step": 137000 |
| }, |
| { |
| "epoch": 1.268672553307314, |
| "grad_norm": 1.1260863542556763, |
| "learning_rate": 4.365668336701082e-05, |
| "loss": 0.0586, |
| "num_input_tokens_seen": 140799232, |
| "step": 137500 |
| }, |
| { |
| "epoch": 1.2732859080466135, |
| "grad_norm": 3.8169174194335938, |
| "learning_rate": 4.363361659331433e-05, |
| "loss": 0.0616, |
| "num_input_tokens_seen": 141311232, |
| "step": 138000 |
| }, |
| { |
| "epoch": 1.2778992627859127, |
| "grad_norm": 0.5968789458274841, |
| "learning_rate": 4.361054981961783e-05, |
| "loss": 0.0586, |
| "num_input_tokens_seen": 141823232, |
| "step": 138500 |
| }, |
| { |
| "epoch": 1.282512617525212, |
| "grad_norm": 1.5847851037979126, |
| "learning_rate": 4.358748304592133e-05, |
| "loss": 0.0531, |
| "num_input_tokens_seen": 142335232, |
| "step": 139000 |
| }, |
| { |
| "epoch": 1.2871259722645112, |
| "grad_norm": 1.6152338981628418, |
| "learning_rate": 4.356441627222484e-05, |
| "loss": 0.0621, |
| "num_input_tokens_seen": 142847232, |
| "step": 139500 |
| }, |
| { |
| "epoch": 1.2917393270038107, |
| "grad_norm": 1.3131306171417236, |
| "learning_rate": 4.3541349498528346e-05, |
| "loss": 0.0596, |
| "num_input_tokens_seen": 143359232, |
| "step": 140000 |
| }, |
| { |
| "epoch": 1.29635268174311, |
| "grad_norm": 1.424111247062683, |
| "learning_rate": 4.351828272483185e-05, |
| "loss": 0.0606, |
| "num_input_tokens_seen": 143871232, |
| "step": 140500 |
| }, |
| { |
| "epoch": 1.3009660364824094, |
| "grad_norm": 0.8023368716239929, |
| "learning_rate": 4.349521595113535e-05, |
| "loss": 0.0644, |
| "num_input_tokens_seen": 144383232, |
| "step": 141000 |
| }, |
| { |
| "epoch": 1.3055793912217086, |
| "grad_norm": 1.9093987941741943, |
| "learning_rate": 4.3472149177438855e-05, |
| "loss": 0.063, |
| "num_input_tokens_seen": 144895232, |
| "step": 141500 |
| }, |
| { |
| "epoch": 1.3101927459610079, |
| "grad_norm": 2.1738569736480713, |
| "learning_rate": 4.3449082403742355e-05, |
| "loss": 0.0627, |
| "num_input_tokens_seen": 145407232, |
| "step": 142000 |
| }, |
| { |
| "epoch": 1.3148061007003071, |
| "grad_norm": 2.2907350063323975, |
| "learning_rate": 4.3426015630045856e-05, |
| "loss": 0.0628, |
| "num_input_tokens_seen": 145919232, |
| "step": 142500 |
| }, |
| { |
| "epoch": 1.3194194554396066, |
| "grad_norm": 1.2344714403152466, |
| "learning_rate": 4.340294885634936e-05, |
| "loss": 0.0589, |
| "num_input_tokens_seen": 146431232, |
| "step": 143000 |
| }, |
| { |
| "epoch": 1.3240328101789058, |
| "grad_norm": 2.3011679649353027, |
| "learning_rate": 4.337988208265287e-05, |
| "loss": 0.0639, |
| "num_input_tokens_seen": 146943232, |
| "step": 143500 |
| }, |
| { |
| "epoch": 1.3286461649182053, |
| "grad_norm": 1.3081352710723877, |
| "learning_rate": 4.3356815308956364e-05, |
| "loss": 0.0607, |
| "num_input_tokens_seen": 147455232, |
| "step": 144000 |
| }, |
| { |
| "epoch": 1.3332595196575046, |
| "grad_norm": 1.5605255365371704, |
| "learning_rate": 4.333374853525987e-05, |
| "loss": 0.0619, |
| "num_input_tokens_seen": 147967232, |
| "step": 144500 |
| }, |
| { |
| "epoch": 1.3378728743968038, |
| "grad_norm": 1.3698718547821045, |
| "learning_rate": 4.331068176156338e-05, |
| "loss": 0.0592, |
| "num_input_tokens_seen": 148479232, |
| "step": 145000 |
| }, |
| { |
| "epoch": 1.3424862291361033, |
| "grad_norm": 0.7845633029937744, |
| "learning_rate": 4.328761498786688e-05, |
| "loss": 0.0649, |
| "num_input_tokens_seen": 148991232, |
| "step": 145500 |
| }, |
| { |
| "epoch": 1.3470995838754025, |
| "grad_norm": 2.0420374870300293, |
| "learning_rate": 4.326454821417038e-05, |
| "loss": 0.0598, |
| "num_input_tokens_seen": 149503232, |
| "step": 146000 |
| }, |
| { |
| "epoch": 1.3517129386147018, |
| "grad_norm": 2.2831552028656006, |
| "learning_rate": 4.324148144047389e-05, |
| "loss": 0.0614, |
| "num_input_tokens_seen": 150015232, |
| "step": 146500 |
| }, |
| { |
| "epoch": 1.3563262933540012, |
| "grad_norm": 0.9809445738792419, |
| "learning_rate": 4.321841466677739e-05, |
| "loss": 0.0588, |
| "num_input_tokens_seen": 150527232, |
| "step": 147000 |
| }, |
| { |
| "epoch": 1.3609396480933005, |
| "grad_norm": 1.6517871618270874, |
| "learning_rate": 4.3195347893080895e-05, |
| "loss": 0.061, |
| "num_input_tokens_seen": 151039232, |
| "step": 147500 |
| }, |
| { |
| "epoch": 1.3655530028325997, |
| "grad_norm": 0.8756200075149536, |
| "learning_rate": 4.3172281119384396e-05, |
| "loss": 0.0601, |
| "num_input_tokens_seen": 151551232, |
| "step": 148000 |
| }, |
| { |
| "epoch": 1.3701663575718992, |
| "grad_norm": 4.2246317863464355, |
| "learning_rate": 4.31492143456879e-05, |
| "loss": 0.0559, |
| "num_input_tokens_seen": 152063232, |
| "step": 148500 |
| }, |
| { |
| "epoch": 1.3747797123111984, |
| "grad_norm": 3.220839738845825, |
| "learning_rate": 4.3126147571991404e-05, |
| "loss": 0.0572, |
| "num_input_tokens_seen": 152575232, |
| "step": 149000 |
| }, |
| { |
| "epoch": 1.379393067050498, |
| "grad_norm": 1.6114301681518555, |
| "learning_rate": 4.3103080798294905e-05, |
| "loss": 0.0593, |
| "num_input_tokens_seen": 153087232, |
| "step": 149500 |
| }, |
| { |
| "epoch": 1.3840064217897972, |
| "grad_norm": 0.6551116108894348, |
| "learning_rate": 4.3080014024598405e-05, |
| "loss": 0.0626, |
| "num_input_tokens_seen": 153599232, |
| "step": 150000 |
| }, |
| { |
| "epoch": 1.3886197765290964, |
| "grad_norm": 2.2895658016204834, |
| "learning_rate": 4.305694725090191e-05, |
| "loss": 0.064, |
| "num_input_tokens_seen": 154111232, |
| "step": 150500 |
| }, |
| { |
| "epoch": 1.3932331312683957, |
| "grad_norm": 2.927482843399048, |
| "learning_rate": 4.303388047720542e-05, |
| "loss": 0.0625, |
| "num_input_tokens_seen": 154623232, |
| "step": 151000 |
| }, |
| { |
| "epoch": 1.3978464860076951, |
| "grad_norm": 1.2749851942062378, |
| "learning_rate": 4.301081370350892e-05, |
| "loss": 0.0579, |
| "num_input_tokens_seen": 155135232, |
| "step": 151500 |
| }, |
| { |
| "epoch": 1.4024598407469944, |
| "grad_norm": 1.7866413593292236, |
| "learning_rate": 4.298774692981242e-05, |
| "loss": 0.0574, |
| "num_input_tokens_seen": 155647232, |
| "step": 152000 |
| }, |
| { |
| "epoch": 1.4070731954862938, |
| "grad_norm": 2.288804292678833, |
| "learning_rate": 4.296468015611593e-05, |
| "loss": 0.0631, |
| "num_input_tokens_seen": 156159232, |
| "step": 152500 |
| }, |
| { |
| "epoch": 1.411686550225593, |
| "grad_norm": 1.509840965270996, |
| "learning_rate": 4.294161338241943e-05, |
| "loss": 0.0585, |
| "num_input_tokens_seen": 156671232, |
| "step": 153000 |
| }, |
| { |
| "epoch": 1.4162999049648923, |
| "grad_norm": 0.8478446006774902, |
| "learning_rate": 4.291854660872293e-05, |
| "loss": 0.0593, |
| "num_input_tokens_seen": 157183232, |
| "step": 153500 |
| }, |
| { |
| "epoch": 1.4209132597041916, |
| "grad_norm": 1.4515230655670166, |
| "learning_rate": 4.289547983502644e-05, |
| "loss": 0.0599, |
| "num_input_tokens_seen": 157695232, |
| "step": 154000 |
| }, |
| { |
| "epoch": 1.425526614443491, |
| "grad_norm": 0.7513217926025391, |
| "learning_rate": 4.2872413061329944e-05, |
| "loss": 0.0602, |
| "num_input_tokens_seen": 158207232, |
| "step": 154500 |
| }, |
| { |
| "epoch": 1.4301399691827903, |
| "grad_norm": 2.4477181434631348, |
| "learning_rate": 4.284934628763344e-05, |
| "loss": 0.0583, |
| "num_input_tokens_seen": 158719232, |
| "step": 155000 |
| }, |
| { |
| "epoch": 1.4347533239220898, |
| "grad_norm": 1.2855825424194336, |
| "learning_rate": 4.2826279513936945e-05, |
| "loss": 0.0653, |
| "num_input_tokens_seen": 159231232, |
| "step": 155500 |
| }, |
| { |
| "epoch": 1.439366678661389, |
| "grad_norm": 0.5422343611717224, |
| "learning_rate": 4.280321274024045e-05, |
| "loss": 0.0601, |
| "num_input_tokens_seen": 159743232, |
| "step": 156000 |
| }, |
| { |
| "epoch": 1.4439800334006883, |
| "grad_norm": 1.519142746925354, |
| "learning_rate": 4.278014596654395e-05, |
| "loss": 0.0558, |
| "num_input_tokens_seen": 160255232, |
| "step": 156500 |
| }, |
| { |
| "epoch": 1.4485933881399875, |
| "grad_norm": 1.936989426612854, |
| "learning_rate": 4.2757079192847454e-05, |
| "loss": 0.0572, |
| "num_input_tokens_seen": 160767232, |
| "step": 157000 |
| }, |
| { |
| "epoch": 1.453206742879287, |
| "grad_norm": 2.0965301990509033, |
| "learning_rate": 4.273401241915096e-05, |
| "loss": 0.0655, |
| "num_input_tokens_seen": 161279232, |
| "step": 157500 |
| }, |
| { |
| "epoch": 1.4578200976185862, |
| "grad_norm": 1.300350308418274, |
| "learning_rate": 4.271094564545446e-05, |
| "loss": 0.0606, |
| "num_input_tokens_seen": 161791232, |
| "step": 158000 |
| }, |
| { |
| "epoch": 1.4624334523578857, |
| "grad_norm": 2.8612143993377686, |
| "learning_rate": 4.268787887175797e-05, |
| "loss": 0.0587, |
| "num_input_tokens_seen": 162303232, |
| "step": 158500 |
| }, |
| { |
| "epoch": 1.467046807097185, |
| "grad_norm": 1.869927167892456, |
| "learning_rate": 4.266481209806147e-05, |
| "loss": 0.0626, |
| "num_input_tokens_seen": 162815232, |
| "step": 159000 |
| }, |
| { |
| "epoch": 1.4716601618364842, |
| "grad_norm": 0.6784268617630005, |
| "learning_rate": 4.264174532436497e-05, |
| "loss": 0.0587, |
| "num_input_tokens_seen": 163327232, |
| "step": 159500 |
| }, |
| { |
| "epoch": 1.4762735165757837, |
| "grad_norm": 1.315468192100525, |
| "learning_rate": 4.261867855066848e-05, |
| "loss": 0.0558, |
| "num_input_tokens_seen": 163839232, |
| "step": 160000 |
| }, |
| { |
| "epoch": 1.480886871315083, |
| "grad_norm": 0.5266712307929993, |
| "learning_rate": 4.2595611776971985e-05, |
| "loss": 0.0601, |
| "num_input_tokens_seen": 164351232, |
| "step": 160500 |
| }, |
| { |
| "epoch": 1.4855002260543821, |
| "grad_norm": 0.976466178894043, |
| "learning_rate": 4.2572545003275486e-05, |
| "loss": 0.059, |
| "num_input_tokens_seen": 164863232, |
| "step": 161000 |
| }, |
| { |
| "epoch": 1.4901135807936816, |
| "grad_norm": 2.195340633392334, |
| "learning_rate": 4.2549478229578986e-05, |
| "loss": 0.0618, |
| "num_input_tokens_seen": 165375232, |
| "step": 161500 |
| }, |
| { |
| "epoch": 1.4947269355329809, |
| "grad_norm": 0.6188003420829773, |
| "learning_rate": 4.2526411455882494e-05, |
| "loss": 0.062, |
| "num_input_tokens_seen": 165887232, |
| "step": 162000 |
| }, |
| { |
| "epoch": 1.49934029027228, |
| "grad_norm": 1.496407389640808, |
| "learning_rate": 4.2503344682185994e-05, |
| "loss": 0.0591, |
| "num_input_tokens_seen": 166399232, |
| "step": 162500 |
| }, |
| { |
| "epoch": 1.5039536450115794, |
| "grad_norm": 0.94919753074646, |
| "learning_rate": 4.2480277908489495e-05, |
| "loss": 0.06, |
| "num_input_tokens_seen": 166911232, |
| "step": 163000 |
| }, |
| { |
| "epoch": 1.5085669997508788, |
| "grad_norm": 1.6207939386367798, |
| "learning_rate": 4.2457211134793e-05, |
| "loss": 0.0599, |
| "num_input_tokens_seen": 167423232, |
| "step": 163500 |
| }, |
| { |
| "epoch": 1.5131803544901783, |
| "grad_norm": 1.1205254793167114, |
| "learning_rate": 4.24341443610965e-05, |
| "loss": 0.0617, |
| "num_input_tokens_seen": 167935232, |
| "step": 164000 |
| }, |
| { |
| "epoch": 1.5177937092294775, |
| "grad_norm": 1.0323721170425415, |
| "learning_rate": 4.24110775874e-05, |
| "loss": 0.0601, |
| "num_input_tokens_seen": 168447232, |
| "step": 164500 |
| }, |
| { |
| "epoch": 1.5224070639687768, |
| "grad_norm": 0.6799350380897522, |
| "learning_rate": 4.238801081370351e-05, |
| "loss": 0.0631, |
| "num_input_tokens_seen": 168959232, |
| "step": 165000 |
| }, |
| { |
| "epoch": 1.527020418708076, |
| "grad_norm": 1.2749136686325073, |
| "learning_rate": 4.236494404000702e-05, |
| "loss": 0.058, |
| "num_input_tokens_seen": 169471232, |
| "step": 165500 |
| }, |
| { |
| "epoch": 1.5316337734473755, |
| "grad_norm": 2.35078763961792, |
| "learning_rate": 4.234187726631052e-05, |
| "loss": 0.066, |
| "num_input_tokens_seen": 169983232, |
| "step": 166000 |
| }, |
| { |
| "epoch": 1.5362471281866747, |
| "grad_norm": 1.8924311399459839, |
| "learning_rate": 4.231881049261402e-05, |
| "loss": 0.0591, |
| "num_input_tokens_seen": 170495232, |
| "step": 166500 |
| }, |
| { |
| "epoch": 1.5408604829259742, |
| "grad_norm": 2.8488757610321045, |
| "learning_rate": 4.2295743718917527e-05, |
| "loss": 0.0584, |
| "num_input_tokens_seen": 171007232, |
| "step": 167000 |
| }, |
| { |
| "epoch": 1.5454738376652735, |
| "grad_norm": 1.7758262157440186, |
| "learning_rate": 4.227267694522103e-05, |
| "loss": 0.0661, |
| "num_input_tokens_seen": 171519232, |
| "step": 167500 |
| }, |
| { |
| "epoch": 1.5500871924045727, |
| "grad_norm": 0.7893622517585754, |
| "learning_rate": 4.224961017152453e-05, |
| "loss": 0.0594, |
| "num_input_tokens_seen": 172031232, |
| "step": 168000 |
| }, |
| { |
| "epoch": 1.554700547143872, |
| "grad_norm": 1.069485068321228, |
| "learning_rate": 4.2226543397828035e-05, |
| "loss": 0.0656, |
| "num_input_tokens_seen": 172543232, |
| "step": 168500 |
| }, |
| { |
| "epoch": 1.5593139018831714, |
| "grad_norm": 2.2371785640716553, |
| "learning_rate": 4.2203476624131536e-05, |
| "loss": 0.058, |
| "num_input_tokens_seen": 173055232, |
| "step": 169000 |
| }, |
| { |
| "epoch": 1.5639272566224707, |
| "grad_norm": 1.76310396194458, |
| "learning_rate": 4.218040985043504e-05, |
| "loss": 0.0623, |
| "num_input_tokens_seen": 173567232, |
| "step": 169500 |
| }, |
| { |
| "epoch": 1.5685406113617701, |
| "grad_norm": 2.7890520095825195, |
| "learning_rate": 4.2157343076738544e-05, |
| "loss": 0.0582, |
| "num_input_tokens_seen": 174079232, |
| "step": 170000 |
| }, |
| { |
| "epoch": 1.5731539661010694, |
| "grad_norm": 2.2342007160186768, |
| "learning_rate": 4.2134276303042044e-05, |
| "loss": 0.0645, |
| "num_input_tokens_seen": 174591232, |
| "step": 170500 |
| }, |
| { |
| "epoch": 1.5777673208403686, |
| "grad_norm": 1.6538183689117432, |
| "learning_rate": 4.211120952934555e-05, |
| "loss": 0.0578, |
| "num_input_tokens_seen": 175103232, |
| "step": 171000 |
| }, |
| { |
| "epoch": 1.5823806755796679, |
| "grad_norm": 6.509249687194824, |
| "learning_rate": 4.208814275564906e-05, |
| "loss": 0.0638, |
| "num_input_tokens_seen": 175615232, |
| "step": 171500 |
| }, |
| { |
| "epoch": 1.5869940303189674, |
| "grad_norm": 2.7748773097991943, |
| "learning_rate": 4.206507598195256e-05, |
| "loss": 0.0646, |
| "num_input_tokens_seen": 176127232, |
| "step": 172000 |
| }, |
| { |
| "epoch": 1.5916073850582668, |
| "grad_norm": 4.16091251373291, |
| "learning_rate": 4.204200920825606e-05, |
| "loss": 0.0653, |
| "num_input_tokens_seen": 176639232, |
| "step": 172500 |
| }, |
| { |
| "epoch": 1.596220739797566, |
| "grad_norm": 1.4821609258651733, |
| "learning_rate": 4.201894243455957e-05, |
| "loss": 0.0642, |
| "num_input_tokens_seen": 177151232, |
| "step": 173000 |
| }, |
| { |
| "epoch": 1.6008340945368653, |
| "grad_norm": 0.9436431527137756, |
| "learning_rate": 4.199587566086307e-05, |
| "loss": 0.0603, |
| "num_input_tokens_seen": 177663232, |
| "step": 173500 |
| }, |
| { |
| "epoch": 1.6054474492761646, |
| "grad_norm": 1.735992193222046, |
| "learning_rate": 4.197280888716657e-05, |
| "loss": 0.0596, |
| "num_input_tokens_seen": 178175232, |
| "step": 174000 |
| }, |
| { |
| "epoch": 1.6100608040154638, |
| "grad_norm": 1.1625646352767944, |
| "learning_rate": 4.1949742113470076e-05, |
| "loss": 0.0601, |
| "num_input_tokens_seen": 178687232, |
| "step": 174500 |
| }, |
| { |
| "epoch": 1.6146741587547633, |
| "grad_norm": 1.0174745321273804, |
| "learning_rate": 4.192667533977358e-05, |
| "loss": 0.058, |
| "num_input_tokens_seen": 179199232, |
| "step": 175000 |
| }, |
| { |
| "epoch": 1.6192875134940627, |
| "grad_norm": 1.141682744026184, |
| "learning_rate": 4.190360856607708e-05, |
| "loss": 0.0622, |
| "num_input_tokens_seen": 179711232, |
| "step": 175500 |
| }, |
| { |
| "epoch": 1.623900868233362, |
| "grad_norm": 1.165004014968872, |
| "learning_rate": 4.1880541792380585e-05, |
| "loss": 0.0627, |
| "num_input_tokens_seen": 180223232, |
| "step": 176000 |
| }, |
| { |
| "epoch": 1.6285142229726612, |
| "grad_norm": 2.1781582832336426, |
| "learning_rate": 4.185747501868409e-05, |
| "loss": 0.0631, |
| "num_input_tokens_seen": 180735232, |
| "step": 176500 |
| }, |
| { |
| "epoch": 1.6331275777119605, |
| "grad_norm": 1.5659372806549072, |
| "learning_rate": 4.183440824498759e-05, |
| "loss": 0.0607, |
| "num_input_tokens_seen": 181247232, |
| "step": 177000 |
| }, |
| { |
| "epoch": 1.63774093245126, |
| "grad_norm": 1.9345473051071167, |
| "learning_rate": 4.181134147129109e-05, |
| "loss": 0.0567, |
| "num_input_tokens_seen": 181759232, |
| "step": 177500 |
| }, |
| { |
| "epoch": 1.6423542871905592, |
| "grad_norm": 0.8415033221244812, |
| "learning_rate": 4.17882746975946e-05, |
| "loss": 0.06, |
| "num_input_tokens_seen": 182271232, |
| "step": 178000 |
| }, |
| { |
| "epoch": 1.6469676419298587, |
| "grad_norm": 0.4496413767337799, |
| "learning_rate": 4.17652079238981e-05, |
| "loss": 0.0583, |
| "num_input_tokens_seen": 182783232, |
| "step": 178500 |
| }, |
| { |
| "epoch": 1.651580996669158, |
| "grad_norm": 1.1432942152023315, |
| "learning_rate": 4.174214115020161e-05, |
| "loss": 0.062, |
| "num_input_tokens_seen": 183295232, |
| "step": 179000 |
| }, |
| { |
| "epoch": 1.6561943514084572, |
| "grad_norm": 0.4867847263813019, |
| "learning_rate": 4.171907437650511e-05, |
| "loss": 0.0653, |
| "num_input_tokens_seen": 183807232, |
| "step": 179500 |
| }, |
| { |
| "epoch": 1.6608077061477564, |
| "grad_norm": 3.039292335510254, |
| "learning_rate": 4.169600760280861e-05, |
| "loss": 0.0578, |
| "num_input_tokens_seen": 184319232, |
| "step": 180000 |
| }, |
| { |
| "epoch": 1.6654210608870559, |
| "grad_norm": 2.18542218208313, |
| "learning_rate": 4.167294082911212e-05, |
| "loss": 0.064, |
| "num_input_tokens_seen": 184831232, |
| "step": 180500 |
| }, |
| { |
| "epoch": 1.6700344156263551, |
| "grad_norm": 0.9734911918640137, |
| "learning_rate": 4.164987405541562e-05, |
| "loss": 0.0578, |
| "num_input_tokens_seen": 185343232, |
| "step": 181000 |
| }, |
| { |
| "epoch": 1.6746477703656546, |
| "grad_norm": 0.8751457929611206, |
| "learning_rate": 4.162680728171912e-05, |
| "loss": 0.0593, |
| "num_input_tokens_seen": 185855232, |
| "step": 181500 |
| }, |
| { |
| "epoch": 1.6792611251049538, |
| "grad_norm": 1.0533229112625122, |
| "learning_rate": 4.1603740508022625e-05, |
| "loss": 0.0601, |
| "num_input_tokens_seen": 186367232, |
| "step": 182000 |
| }, |
| { |
| "epoch": 1.683874479844253, |
| "grad_norm": 0.742938220500946, |
| "learning_rate": 4.158067373432613e-05, |
| "loss": 0.0589, |
| "num_input_tokens_seen": 186879232, |
| "step": 182500 |
| }, |
| { |
| "epoch": 1.6884878345835523, |
| "grad_norm": 1.432569146156311, |
| "learning_rate": 4.155760696062963e-05, |
| "loss": 0.061, |
| "num_input_tokens_seen": 187391232, |
| "step": 183000 |
| }, |
| { |
| "epoch": 1.6931011893228518, |
| "grad_norm": 2.900394916534424, |
| "learning_rate": 4.1534540186933134e-05, |
| "loss": 0.058, |
| "num_input_tokens_seen": 187903232, |
| "step": 183500 |
| }, |
| { |
| "epoch": 1.6977145440621513, |
| "grad_norm": 1.1864616870880127, |
| "learning_rate": 4.151147341323664e-05, |
| "loss": 0.0594, |
| "num_input_tokens_seen": 188415232, |
| "step": 184000 |
| }, |
| { |
| "epoch": 1.7023278988014505, |
| "grad_norm": 2.3834102153778076, |
| "learning_rate": 4.148840663954014e-05, |
| "loss": 0.0623, |
| "num_input_tokens_seen": 188927232, |
| "step": 184500 |
| }, |
| { |
| "epoch": 1.7069412535407498, |
| "grad_norm": 2.183478355407715, |
| "learning_rate": 4.146533986584364e-05, |
| "loss": 0.0621, |
| "num_input_tokens_seen": 189439232, |
| "step": 185000 |
| }, |
| { |
| "epoch": 1.711554608280049, |
| "grad_norm": 1.4946995973587036, |
| "learning_rate": 4.144227309214715e-05, |
| "loss": 0.0585, |
| "num_input_tokens_seen": 189951232, |
| "step": 185500 |
| }, |
| { |
| "epoch": 1.7161679630193483, |
| "grad_norm": 2.6389856338500977, |
| "learning_rate": 4.141920631845066e-05, |
| "loss": 0.0641, |
| "num_input_tokens_seen": 190463232, |
| "step": 186000 |
| }, |
| { |
| "epoch": 1.7207813177586477, |
| "grad_norm": 1.5870720148086548, |
| "learning_rate": 4.139613954475416e-05, |
| "loss": 0.0622, |
| "num_input_tokens_seen": 190975232, |
| "step": 186500 |
| }, |
| { |
| "epoch": 1.7253946724979472, |
| "grad_norm": 1.0115468502044678, |
| "learning_rate": 4.137307277105766e-05, |
| "loss": 0.0602, |
| "num_input_tokens_seen": 191487232, |
| "step": 187000 |
| }, |
| { |
| "epoch": 1.7300080272372464, |
| "grad_norm": 2.0021095275878906, |
| "learning_rate": 4.1350005997361166e-05, |
| "loss": 0.0585, |
| "num_input_tokens_seen": 191999232, |
| "step": 187500 |
| }, |
| { |
| "epoch": 1.7346213819765457, |
| "grad_norm": 1.7288790941238403, |
| "learning_rate": 4.1326939223664666e-05, |
| "loss": 0.064, |
| "num_input_tokens_seen": 192511232, |
| "step": 188000 |
| }, |
| { |
| "epoch": 1.739234736715845, |
| "grad_norm": 2.1877362728118896, |
| "learning_rate": 4.130387244996817e-05, |
| "loss": 0.061, |
| "num_input_tokens_seen": 193023232, |
| "step": 188500 |
| }, |
| { |
| "epoch": 1.7438480914551442, |
| "grad_norm": 2.1723220348358154, |
| "learning_rate": 4.1280805676271674e-05, |
| "loss": 0.0611, |
| "num_input_tokens_seen": 193535232, |
| "step": 189000 |
| }, |
| { |
| "epoch": 1.7484614461944437, |
| "grad_norm": 1.1203595399856567, |
| "learning_rate": 4.1257738902575175e-05, |
| "loss": 0.0587, |
| "num_input_tokens_seen": 194047232, |
| "step": 189500 |
| }, |
| { |
| "epoch": 1.7530748009337431, |
| "grad_norm": 1.7950832843780518, |
| "learning_rate": 4.123467212887868e-05, |
| "loss": 0.0619, |
| "num_input_tokens_seen": 194559232, |
| "step": 190000 |
| }, |
| { |
| "epoch": 1.7576881556730424, |
| "grad_norm": 0.8511695265769958, |
| "learning_rate": 4.121160535518218e-05, |
| "loss": 0.0587, |
| "num_input_tokens_seen": 195071232, |
| "step": 190500 |
| }, |
| { |
| "epoch": 1.7623015104123416, |
| "grad_norm": 0.49872857332229614, |
| "learning_rate": 4.118853858148568e-05, |
| "loss": 0.0586, |
| "num_input_tokens_seen": 195583232, |
| "step": 191000 |
| }, |
| { |
| "epoch": 1.7669148651516409, |
| "grad_norm": 1.272387981414795, |
| "learning_rate": 4.116547180778919e-05, |
| "loss": 0.062, |
| "num_input_tokens_seen": 196095232, |
| "step": 191500 |
| }, |
| { |
| "epoch": 1.7715282198909403, |
| "grad_norm": 3.0328872203826904, |
| "learning_rate": 4.11424050340927e-05, |
| "loss": 0.0561, |
| "num_input_tokens_seen": 196607232, |
| "step": 192000 |
| }, |
| { |
| "epoch": 1.7761415746302396, |
| "grad_norm": 1.1026365756988525, |
| "learning_rate": 4.111933826039619e-05, |
| "loss": 0.061, |
| "num_input_tokens_seen": 197119232, |
| "step": 192500 |
| }, |
| { |
| "epoch": 1.780754929369539, |
| "grad_norm": 1.523284673690796, |
| "learning_rate": 4.10962714866997e-05, |
| "loss": 0.0647, |
| "num_input_tokens_seen": 197631232, |
| "step": 193000 |
| }, |
| { |
| "epoch": 1.7853682841088383, |
| "grad_norm": 2.571349859237671, |
| "learning_rate": 4.1073204713003207e-05, |
| "loss": 0.0572, |
| "num_input_tokens_seen": 198143232, |
| "step": 193500 |
| }, |
| { |
| "epoch": 1.7899816388481375, |
| "grad_norm": 1.1206070184707642, |
| "learning_rate": 4.105013793930671e-05, |
| "loss": 0.065, |
| "num_input_tokens_seen": 198655232, |
| "step": 194000 |
| }, |
| { |
| "epoch": 1.7945949935874368, |
| "grad_norm": 1.2172856330871582, |
| "learning_rate": 4.102707116561021e-05, |
| "loss": 0.0624, |
| "num_input_tokens_seen": 199167232, |
| "step": 194500 |
| }, |
| { |
| "epoch": 1.7992083483267363, |
| "grad_norm": 1.3785135746002197, |
| "learning_rate": 4.1004004391913715e-05, |
| "loss": 0.0619, |
| "num_input_tokens_seen": 199679232, |
| "step": 195000 |
| }, |
| { |
| "epoch": 1.8038217030660355, |
| "grad_norm": 1.8791236877441406, |
| "learning_rate": 4.0980937618217216e-05, |
| "loss": 0.0594, |
| "num_input_tokens_seen": 200191232, |
| "step": 195500 |
| }, |
| { |
| "epoch": 1.808435057805335, |
| "grad_norm": 1.4721789360046387, |
| "learning_rate": 4.0957870844520716e-05, |
| "loss": 0.0584, |
| "num_input_tokens_seen": 200703232, |
| "step": 196000 |
| }, |
| { |
| "epoch": 1.8130484125446342, |
| "grad_norm": 2.4450087547302246, |
| "learning_rate": 4.0934804070824224e-05, |
| "loss": 0.0622, |
| "num_input_tokens_seen": 201215232, |
| "step": 196500 |
| }, |
| { |
| "epoch": 1.8176617672839335, |
| "grad_norm": 2.5776455402374268, |
| "learning_rate": 4.091173729712773e-05, |
| "loss": 0.062, |
| "num_input_tokens_seen": 201727232, |
| "step": 197000 |
| }, |
| { |
| "epoch": 1.8222751220232327, |
| "grad_norm": 0.703079104423523, |
| "learning_rate": 4.088867052343123e-05, |
| "loss": 0.063, |
| "num_input_tokens_seen": 202239232, |
| "step": 197500 |
| }, |
| { |
| "epoch": 1.8268884767625322, |
| "grad_norm": 3.7383570671081543, |
| "learning_rate": 4.086560374973473e-05, |
| "loss": 0.0621, |
| "num_input_tokens_seen": 202751232, |
| "step": 198000 |
| }, |
| { |
| "epoch": 1.8315018315018317, |
| "grad_norm": 1.2119007110595703, |
| "learning_rate": 4.084253697603824e-05, |
| "loss": 0.0638, |
| "num_input_tokens_seen": 203263232, |
| "step": 198500 |
| }, |
| { |
| "epoch": 1.836115186241131, |
| "grad_norm": 1.6069977283477783, |
| "learning_rate": 4.081947020234174e-05, |
| "loss": 0.0594, |
| "num_input_tokens_seen": 203775232, |
| "step": 199000 |
| }, |
| { |
| "epoch": 1.8407285409804302, |
| "grad_norm": 0.5176113843917847, |
| "learning_rate": 4.079640342864525e-05, |
| "loss": 0.0565, |
| "num_input_tokens_seen": 204287232, |
| "step": 199500 |
| }, |
| { |
| "epoch": 1.8453418957197294, |
| "grad_norm": 1.78886878490448, |
| "learning_rate": 4.077333665494875e-05, |
| "loss": 0.0599, |
| "num_input_tokens_seen": 204799232, |
| "step": 200000 |
| }, |
| { |
| "epoch": 1.8499552504590286, |
| "grad_norm": 0.8037757277488708, |
| "learning_rate": 4.075026988125225e-05, |
| "loss": 0.0584, |
| "num_input_tokens_seen": 205311232, |
| "step": 200500 |
| }, |
| { |
| "epoch": 1.8545686051983281, |
| "grad_norm": 0.8422955274581909, |
| "learning_rate": 4.0727203107555756e-05, |
| "loss": 0.0626, |
| "num_input_tokens_seen": 205823232, |
| "step": 201000 |
| }, |
| { |
| "epoch": 1.8591819599376276, |
| "grad_norm": 3.384787082672119, |
| "learning_rate": 4.0704136333859257e-05, |
| "loss": 0.0603, |
| "num_input_tokens_seen": 206335232, |
| "step": 201500 |
| }, |
| { |
| "epoch": 1.8637953146769268, |
| "grad_norm": 1.103167176246643, |
| "learning_rate": 4.068106956016276e-05, |
| "loss": 0.0608, |
| "num_input_tokens_seen": 206847232, |
| "step": 202000 |
| }, |
| { |
| "epoch": 1.868408669416226, |
| "grad_norm": 0.9550286531448364, |
| "learning_rate": 4.0658002786466264e-05, |
| "loss": 0.0583, |
| "num_input_tokens_seen": 207359232, |
| "step": 202500 |
| }, |
| { |
| "epoch": 1.8730220241555253, |
| "grad_norm": 1.2629748582839966, |
| "learning_rate": 4.063493601276977e-05, |
| "loss": 0.0599, |
| "num_input_tokens_seen": 207871232, |
| "step": 203000 |
| }, |
| { |
| "epoch": 1.8776353788948248, |
| "grad_norm": 1.8319883346557617, |
| "learning_rate": 4.061186923907327e-05, |
| "loss": 0.0557, |
| "num_input_tokens_seen": 208383232, |
| "step": 203500 |
| }, |
| { |
| "epoch": 1.882248733634124, |
| "grad_norm": 0.8122320175170898, |
| "learning_rate": 4.058880246537677e-05, |
| "loss": 0.0631, |
| "num_input_tokens_seen": 208895232, |
| "step": 204000 |
| }, |
| { |
| "epoch": 1.8868620883734235, |
| "grad_norm": 1.0240248441696167, |
| "learning_rate": 4.056573569168028e-05, |
| "loss": 0.0571, |
| "num_input_tokens_seen": 209407232, |
| "step": 204500 |
| }, |
| { |
| "epoch": 1.8914754431127228, |
| "grad_norm": 1.0079154968261719, |
| "learning_rate": 4.054266891798378e-05, |
| "loss": 0.0591, |
| "num_input_tokens_seen": 209919232, |
| "step": 205000 |
| }, |
| { |
| "epoch": 1.896088797852022, |
| "grad_norm": 0.7955754399299622, |
| "learning_rate": 4.051960214428728e-05, |
| "loss": 0.0579, |
| "num_input_tokens_seen": 210431232, |
| "step": 205500 |
| }, |
| { |
| "epoch": 1.9007021525913212, |
| "grad_norm": 2.3598215579986572, |
| "learning_rate": 4.049653537059079e-05, |
| "loss": 0.0578, |
| "num_input_tokens_seen": 210943232, |
| "step": 206000 |
| }, |
| { |
| "epoch": 1.9053155073306207, |
| "grad_norm": 2.217241048812866, |
| "learning_rate": 4.047346859689429e-05, |
| "loss": 0.0615, |
| "num_input_tokens_seen": 211455232, |
| "step": 206500 |
| }, |
| { |
| "epoch": 1.90992886206992, |
| "grad_norm": 0.9427639245986938, |
| "learning_rate": 4.045040182319779e-05, |
| "loss": 0.0654, |
| "num_input_tokens_seen": 211967232, |
| "step": 207000 |
| }, |
| { |
| "epoch": 1.9145422168092194, |
| "grad_norm": 2.3182663917541504, |
| "learning_rate": 4.04273350495013e-05, |
| "loss": 0.0605, |
| "num_input_tokens_seen": 212479232, |
| "step": 207500 |
| }, |
| { |
| "epoch": 1.9191555715485187, |
| "grad_norm": 2.283663272857666, |
| "learning_rate": 4.0404268275804805e-05, |
| "loss": 0.059, |
| "num_input_tokens_seen": 212991232, |
| "step": 208000 |
| }, |
| { |
| "epoch": 1.923768926287818, |
| "grad_norm": 0.8118070960044861, |
| "learning_rate": 4.0381201502108305e-05, |
| "loss": 0.0606, |
| "num_input_tokens_seen": 213503232, |
| "step": 208500 |
| }, |
| { |
| "epoch": 1.9283822810271172, |
| "grad_norm": 1.4257065057754517, |
| "learning_rate": 4.0358134728411806e-05, |
| "loss": 0.0619, |
| "num_input_tokens_seen": 214015232, |
| "step": 209000 |
| }, |
| { |
| "epoch": 1.9329956357664166, |
| "grad_norm": 1.2044384479522705, |
| "learning_rate": 4.033506795471531e-05, |
| "loss": 0.0554, |
| "num_input_tokens_seen": 214527232, |
| "step": 209500 |
| }, |
| { |
| "epoch": 1.9376089905057161, |
| "grad_norm": 1.2655075788497925, |
| "learning_rate": 4.0312001181018814e-05, |
| "loss": 0.0569, |
| "num_input_tokens_seen": 215039232, |
| "step": 210000 |
| }, |
| { |
| "epoch": 1.9422223452450154, |
| "grad_norm": 1.7089818716049194, |
| "learning_rate": 4.028893440732232e-05, |
| "loss": 0.062, |
| "num_input_tokens_seen": 215551232, |
| "step": 210500 |
| }, |
| { |
| "epoch": 1.9468356999843146, |
| "grad_norm": 1.0826196670532227, |
| "learning_rate": 4.026586763362582e-05, |
| "loss": 0.0611, |
| "num_input_tokens_seen": 216063232, |
| "step": 211000 |
| }, |
| { |
| "epoch": 1.9514490547236139, |
| "grad_norm": 0.5117043852806091, |
| "learning_rate": 4.024280085992932e-05, |
| "loss": 0.0618, |
| "num_input_tokens_seen": 216575232, |
| "step": 211500 |
| }, |
| { |
| "epoch": 1.956062409462913, |
| "grad_norm": 0.4635091722011566, |
| "learning_rate": 4.021973408623283e-05, |
| "loss": 0.0617, |
| "num_input_tokens_seen": 217087232, |
| "step": 212000 |
| }, |
| { |
| "epoch": 1.9606757642022126, |
| "grad_norm": 2.1524128913879395, |
| "learning_rate": 4.019666731253634e-05, |
| "loss": 0.0614, |
| "num_input_tokens_seen": 217599232, |
| "step": 212500 |
| }, |
| { |
| "epoch": 1.965289118941512, |
| "grad_norm": 1.02557373046875, |
| "learning_rate": 4.017360053883983e-05, |
| "loss": 0.0552, |
| "num_input_tokens_seen": 218111232, |
| "step": 213000 |
| }, |
| { |
| "epoch": 1.9699024736808113, |
| "grad_norm": 2.18851375579834, |
| "learning_rate": 4.015053376514334e-05, |
| "loss": 0.0597, |
| "num_input_tokens_seen": 218623232, |
| "step": 213500 |
| }, |
| { |
| "epoch": 1.9745158284201105, |
| "grad_norm": 2.4914391040802, |
| "learning_rate": 4.0127466991446846e-05, |
| "loss": 0.0616, |
| "num_input_tokens_seen": 219135232, |
| "step": 214000 |
| }, |
| { |
| "epoch": 1.9791291831594098, |
| "grad_norm": 1.8353182077407837, |
| "learning_rate": 4.0104400217750346e-05, |
| "loss": 0.0675, |
| "num_input_tokens_seen": 219647232, |
| "step": 214500 |
| }, |
| { |
| "epoch": 1.983742537898709, |
| "grad_norm": 5.431290149688721, |
| "learning_rate": 4.008133344405385e-05, |
| "loss": 0.0568, |
| "num_input_tokens_seen": 220159232, |
| "step": 215000 |
| }, |
| { |
| "epoch": 1.9883558926380085, |
| "grad_norm": 0.523113489151001, |
| "learning_rate": 4.0058266670357354e-05, |
| "loss": 0.0596, |
| "num_input_tokens_seen": 220671232, |
| "step": 215500 |
| }, |
| { |
| "epoch": 1.992969247377308, |
| "grad_norm": 0.5525696277618408, |
| "learning_rate": 4.0035199896660855e-05, |
| "loss": 0.0589, |
| "num_input_tokens_seen": 221183232, |
| "step": 216000 |
| }, |
| { |
| "epoch": 1.9975826021166072, |
| "grad_norm": 2.0920755863189697, |
| "learning_rate": 4.0012133122964355e-05, |
| "loss": 0.0603, |
| "num_input_tokens_seen": 221695232, |
| "step": 216500 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_combined_score": 0.06747195769945506, |
| "eval_loss": 0.0674719586968422, |
| "eval_mse": 0.06747195670206793, |
| "eval_runtime": 46.4608, |
| "eval_samples_per_second": 2073.535, |
| "eval_steps_per_second": 259.208, |
| "num_input_tokens_seen": 221962752, |
| "step": 216762 |
| }, |
| { |
| "epoch": 2.0021959568559065, |
| "grad_norm": 2.938506841659546, |
| "learning_rate": 3.998906634926786e-05, |
| "loss": 0.0546, |
| "num_input_tokens_seen": 222206464, |
| "step": 217000 |
| }, |
| { |
| "epoch": 2.0068093115952057, |
| "grad_norm": 1.5632978677749634, |
| "learning_rate": 3.996599957557137e-05, |
| "loss": 0.0497, |
| "num_input_tokens_seen": 222718464, |
| "step": 217500 |
| }, |
| { |
| "epoch": 2.011422666334505, |
| "grad_norm": 2.7584619522094727, |
| "learning_rate": 3.994293280187487e-05, |
| "loss": 0.0504, |
| "num_input_tokens_seen": 223230464, |
| "step": 218000 |
| }, |
| { |
| "epoch": 2.0160360210738046, |
| "grad_norm": 0.7712005972862244, |
| "learning_rate": 3.991986602817837e-05, |
| "loss": 0.0498, |
| "num_input_tokens_seen": 223742464, |
| "step": 218500 |
| }, |
| { |
| "epoch": 2.020649375813104, |
| "grad_norm": 2.087860584259033, |
| "learning_rate": 3.989679925448188e-05, |
| "loss": 0.0514, |
| "num_input_tokens_seen": 224254464, |
| "step": 219000 |
| }, |
| { |
| "epoch": 2.025262730552403, |
| "grad_norm": 1.5292513370513916, |
| "learning_rate": 3.987373248078538e-05, |
| "loss": 0.046, |
| "num_input_tokens_seen": 224766464, |
| "step": 219500 |
| }, |
| { |
| "epoch": 2.0298760852917024, |
| "grad_norm": 2.2876648902893066, |
| "learning_rate": 3.985066570708888e-05, |
| "loss": 0.0514, |
| "num_input_tokens_seen": 225278464, |
| "step": 220000 |
| }, |
| { |
| "epoch": 2.0344894400310016, |
| "grad_norm": 1.1318377256393433, |
| "learning_rate": 3.982759893339239e-05, |
| "loss": 0.0466, |
| "num_input_tokens_seen": 225790464, |
| "step": 220500 |
| }, |
| { |
| "epoch": 2.039102794770301, |
| "grad_norm": 0.5960507988929749, |
| "learning_rate": 3.980453215969589e-05, |
| "loss": 0.0484, |
| "num_input_tokens_seen": 226302464, |
| "step": 221000 |
| }, |
| { |
| "epoch": 2.0437161495096006, |
| "grad_norm": 1.8446494340896606, |
| "learning_rate": 3.9781465385999395e-05, |
| "loss": 0.0458, |
| "num_input_tokens_seen": 226814464, |
| "step": 221500 |
| }, |
| { |
| "epoch": 2.0483295042489, |
| "grad_norm": 1.8140873908996582, |
| "learning_rate": 3.9758398612302896e-05, |
| "loss": 0.0496, |
| "num_input_tokens_seen": 227326464, |
| "step": 222000 |
| }, |
| { |
| "epoch": 2.052942858988199, |
| "grad_norm": 0.29578447341918945, |
| "learning_rate": 3.9735331838606396e-05, |
| "loss": 0.0447, |
| "num_input_tokens_seen": 227838464, |
| "step": 222500 |
| }, |
| { |
| "epoch": 2.0575562137274983, |
| "grad_norm": 1.8332575559616089, |
| "learning_rate": 3.9712265064909904e-05, |
| "loss": 0.042, |
| "num_input_tokens_seen": 228350464, |
| "step": 223000 |
| }, |
| { |
| "epoch": 2.0621695684667976, |
| "grad_norm": 1.091813325881958, |
| "learning_rate": 3.968919829121341e-05, |
| "loss": 0.0526, |
| "num_input_tokens_seen": 228862464, |
| "step": 223500 |
| }, |
| { |
| "epoch": 2.066782923206097, |
| "grad_norm": 0.7884387373924255, |
| "learning_rate": 3.9666131517516905e-05, |
| "loss": 0.0455, |
| "num_input_tokens_seen": 229374464, |
| "step": 224000 |
| }, |
| { |
| "epoch": 2.0713962779453965, |
| "grad_norm": 2.7083017826080322, |
| "learning_rate": 3.964306474382041e-05, |
| "loss": 0.0457, |
| "num_input_tokens_seen": 229886464, |
| "step": 224500 |
| }, |
| { |
| "epoch": 2.0760096326846957, |
| "grad_norm": 3.8200302124023438, |
| "learning_rate": 3.961999797012392e-05, |
| "loss": 0.0459, |
| "num_input_tokens_seen": 230398464, |
| "step": 225000 |
| }, |
| { |
| "epoch": 2.080622987423995, |
| "grad_norm": 1.0111039876937866, |
| "learning_rate": 3.959693119642742e-05, |
| "loss": 0.0464, |
| "num_input_tokens_seen": 230910464, |
| "step": 225500 |
| }, |
| { |
| "epoch": 2.0852363421632942, |
| "grad_norm": 0.7892510890960693, |
| "learning_rate": 3.957386442273092e-05, |
| "loss": 0.0527, |
| "num_input_tokens_seen": 231422464, |
| "step": 226000 |
| }, |
| { |
| "epoch": 2.0898496969025935, |
| "grad_norm": 0.9745638370513916, |
| "learning_rate": 3.955079764903443e-05, |
| "loss": 0.0446, |
| "num_input_tokens_seen": 231934464, |
| "step": 226500 |
| }, |
| { |
| "epoch": 2.0944630516418927, |
| "grad_norm": 1.1187430620193481, |
| "learning_rate": 3.952773087533793e-05, |
| "loss": 0.0505, |
| "num_input_tokens_seen": 232446464, |
| "step": 227000 |
| }, |
| { |
| "epoch": 2.0990764063811924, |
| "grad_norm": 1.3649568557739258, |
| "learning_rate": 3.950466410164143e-05, |
| "loss": 0.0494, |
| "num_input_tokens_seen": 232958464, |
| "step": 227500 |
| }, |
| { |
| "epoch": 2.1036897611204917, |
| "grad_norm": 1.2664381265640259, |
| "learning_rate": 3.9481597327944936e-05, |
| "loss": 0.0425, |
| "num_input_tokens_seen": 233470464, |
| "step": 228000 |
| }, |
| { |
| "epoch": 2.108303115859791, |
| "grad_norm": 2.6382997035980225, |
| "learning_rate": 3.9458530554248444e-05, |
| "loss": 0.0469, |
| "num_input_tokens_seen": 233982464, |
| "step": 228500 |
| }, |
| { |
| "epoch": 2.11291647059909, |
| "grad_norm": 1.4181214570999146, |
| "learning_rate": 3.9435463780551944e-05, |
| "loss": 0.0465, |
| "num_input_tokens_seen": 234494464, |
| "step": 229000 |
| }, |
| { |
| "epoch": 2.1175298253383894, |
| "grad_norm": 1.2546645402908325, |
| "learning_rate": 3.9412397006855445e-05, |
| "loss": 0.0502, |
| "num_input_tokens_seen": 235006464, |
| "step": 229500 |
| }, |
| { |
| "epoch": 2.122143180077689, |
| "grad_norm": 3.3777077198028564, |
| "learning_rate": 3.938933023315895e-05, |
| "loss": 0.0513, |
| "num_input_tokens_seen": 235518464, |
| "step": 230000 |
| }, |
| { |
| "epoch": 2.1267565348169883, |
| "grad_norm": 1.0438088178634644, |
| "learning_rate": 3.936626345946245e-05, |
| "loss": 0.0452, |
| "num_input_tokens_seen": 236030464, |
| "step": 230500 |
| }, |
| { |
| "epoch": 2.1313698895562876, |
| "grad_norm": 3.252018928527832, |
| "learning_rate": 3.934319668576596e-05, |
| "loss": 0.0463, |
| "num_input_tokens_seen": 236542464, |
| "step": 231000 |
| }, |
| { |
| "epoch": 2.135983244295587, |
| "grad_norm": 0.6309357285499573, |
| "learning_rate": 3.932012991206946e-05, |
| "loss": 0.0456, |
| "num_input_tokens_seen": 237054464, |
| "step": 231500 |
| }, |
| { |
| "epoch": 2.140596599034886, |
| "grad_norm": 0.6404411196708679, |
| "learning_rate": 3.929706313837296e-05, |
| "loss": 0.0469, |
| "num_input_tokens_seen": 237566464, |
| "step": 232000 |
| }, |
| { |
| "epoch": 2.1452099537741853, |
| "grad_norm": 2.673940896987915, |
| "learning_rate": 3.927399636467647e-05, |
| "loss": 0.0495, |
| "num_input_tokens_seen": 238078464, |
| "step": 232500 |
| }, |
| { |
| "epoch": 2.149823308513485, |
| "grad_norm": 0.5295352935791016, |
| "learning_rate": 3.9250929590979976e-05, |
| "loss": 0.0488, |
| "num_input_tokens_seen": 238590464, |
| "step": 233000 |
| }, |
| { |
| "epoch": 2.1544366632527843, |
| "grad_norm": 2.1107120513916016, |
| "learning_rate": 3.922786281728347e-05, |
| "loss": 0.0471, |
| "num_input_tokens_seen": 239102464, |
| "step": 233500 |
| }, |
| { |
| "epoch": 2.1590500179920835, |
| "grad_norm": 0.7328481674194336, |
| "learning_rate": 3.920479604358698e-05, |
| "loss": 0.0482, |
| "num_input_tokens_seen": 239614464, |
| "step": 234000 |
| }, |
| { |
| "epoch": 2.1636633727313828, |
| "grad_norm": 0.5566291213035583, |
| "learning_rate": 3.9181729269890485e-05, |
| "loss": 0.0449, |
| "num_input_tokens_seen": 240126464, |
| "step": 234500 |
| }, |
| { |
| "epoch": 2.168276727470682, |
| "grad_norm": 2.311140537261963, |
| "learning_rate": 3.915866249619398e-05, |
| "loss": 0.0453, |
| "num_input_tokens_seen": 240638464, |
| "step": 235000 |
| }, |
| { |
| "epoch": 2.1728900822099813, |
| "grad_norm": 0.43719959259033203, |
| "learning_rate": 3.9135595722497486e-05, |
| "loss": 0.0484, |
| "num_input_tokens_seen": 241150464, |
| "step": 235500 |
| }, |
| { |
| "epoch": 2.177503436949281, |
| "grad_norm": 1.3434603214263916, |
| "learning_rate": 3.911252894880099e-05, |
| "loss": 0.0471, |
| "num_input_tokens_seen": 241662464, |
| "step": 236000 |
| }, |
| { |
| "epoch": 2.18211679168858, |
| "grad_norm": 1.4311593770980835, |
| "learning_rate": 3.9089462175104494e-05, |
| "loss": 0.0466, |
| "num_input_tokens_seen": 242174464, |
| "step": 236500 |
| }, |
| { |
| "epoch": 2.1867301464278794, |
| "grad_norm": 1.6135164499282837, |
| "learning_rate": 3.9066395401407994e-05, |
| "loss": 0.0459, |
| "num_input_tokens_seen": 242686464, |
| "step": 237000 |
| }, |
| { |
| "epoch": 2.1913435011671787, |
| "grad_norm": 0.8135620951652527, |
| "learning_rate": 3.90433286277115e-05, |
| "loss": 0.0484, |
| "num_input_tokens_seen": 243198464, |
| "step": 237500 |
| }, |
| { |
| "epoch": 2.195956855906478, |
| "grad_norm": 2.1880440711975098, |
| "learning_rate": 3.9020261854015e-05, |
| "loss": 0.0493, |
| "num_input_tokens_seen": 243710464, |
| "step": 238000 |
| }, |
| { |
| "epoch": 2.200570210645777, |
| "grad_norm": 1.676583170890808, |
| "learning_rate": 3.899719508031851e-05, |
| "loss": 0.0505, |
| "num_input_tokens_seen": 244222464, |
| "step": 238500 |
| }, |
| { |
| "epoch": 2.205183565385077, |
| "grad_norm": 2.2629077434539795, |
| "learning_rate": 3.897412830662201e-05, |
| "loss": 0.0501, |
| "num_input_tokens_seen": 244734464, |
| "step": 239000 |
| }, |
| { |
| "epoch": 2.209796920124376, |
| "grad_norm": 2.8751511573791504, |
| "learning_rate": 3.895106153292552e-05, |
| "loss": 0.0446, |
| "num_input_tokens_seen": 245246464, |
| "step": 239500 |
| }, |
| { |
| "epoch": 2.2144102748636754, |
| "grad_norm": 2.8819162845611572, |
| "learning_rate": 3.892799475922902e-05, |
| "loss": 0.05, |
| "num_input_tokens_seen": 245758464, |
| "step": 240000 |
| }, |
| { |
| "epoch": 2.2190236296029746, |
| "grad_norm": 2.6944236755371094, |
| "learning_rate": 3.890492798553252e-05, |
| "loss": 0.0491, |
| "num_input_tokens_seen": 246270464, |
| "step": 240500 |
| }, |
| { |
| "epoch": 2.223636984342274, |
| "grad_norm": 1.2675094604492188, |
| "learning_rate": 3.8881861211836026e-05, |
| "loss": 0.054, |
| "num_input_tokens_seen": 246782464, |
| "step": 241000 |
| }, |
| { |
| "epoch": 2.2282503390815736, |
| "grad_norm": 3.3482534885406494, |
| "learning_rate": 3.885879443813953e-05, |
| "loss": 0.0436, |
| "num_input_tokens_seen": 247294464, |
| "step": 241500 |
| }, |
| { |
| "epoch": 2.232863693820873, |
| "grad_norm": 4.079286575317383, |
| "learning_rate": 3.8835727664443034e-05, |
| "loss": 0.0451, |
| "num_input_tokens_seen": 247806464, |
| "step": 242000 |
| }, |
| { |
| "epoch": 2.237477048560172, |
| "grad_norm": 1.210747480392456, |
| "learning_rate": 3.8812660890746535e-05, |
| "loss": 0.0474, |
| "num_input_tokens_seen": 248318464, |
| "step": 242500 |
| }, |
| { |
| "epoch": 2.2420904032994713, |
| "grad_norm": 0.7511959671974182, |
| "learning_rate": 3.8789594117050035e-05, |
| "loss": 0.0548, |
| "num_input_tokens_seen": 248830464, |
| "step": 243000 |
| }, |
| { |
| "epoch": 2.2467037580387705, |
| "grad_norm": 2.5810165405273438, |
| "learning_rate": 3.876652734335354e-05, |
| "loss": 0.0501, |
| "num_input_tokens_seen": 249342464, |
| "step": 243500 |
| }, |
| { |
| "epoch": 2.25131711277807, |
| "grad_norm": 1.060328722000122, |
| "learning_rate": 3.874346056965705e-05, |
| "loss": 0.0473, |
| "num_input_tokens_seen": 249854464, |
| "step": 244000 |
| }, |
| { |
| "epoch": 2.255930467517369, |
| "grad_norm": 0.6183954477310181, |
| "learning_rate": 3.8720393795960544e-05, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 250366464, |
| "step": 244500 |
| }, |
| { |
| "epoch": 2.2605438222566687, |
| "grad_norm": 1.4669181108474731, |
| "learning_rate": 3.869732702226405e-05, |
| "loss": 0.046, |
| "num_input_tokens_seen": 250878464, |
| "step": 245000 |
| }, |
| { |
| "epoch": 2.265157176995968, |
| "grad_norm": 0.44876328110694885, |
| "learning_rate": 3.867426024856756e-05, |
| "loss": 0.0494, |
| "num_input_tokens_seen": 251390464, |
| "step": 245500 |
| }, |
| { |
| "epoch": 2.269770531735267, |
| "grad_norm": 1.458533763885498, |
| "learning_rate": 3.865119347487106e-05, |
| "loss": 0.0514, |
| "num_input_tokens_seen": 251902464, |
| "step": 246000 |
| }, |
| { |
| "epoch": 2.2743838864745665, |
| "grad_norm": 1.5308929681777954, |
| "learning_rate": 3.862812670117456e-05, |
| "loss": 0.0482, |
| "num_input_tokens_seen": 252414464, |
| "step": 246500 |
| }, |
| { |
| "epoch": 2.2789972412138657, |
| "grad_norm": 2.227228879928589, |
| "learning_rate": 3.860505992747807e-05, |
| "loss": 0.0489, |
| "num_input_tokens_seen": 252926464, |
| "step": 247000 |
| }, |
| { |
| "epoch": 2.2836105959531654, |
| "grad_norm": 0.44453561305999756, |
| "learning_rate": 3.858199315378157e-05, |
| "loss": 0.0494, |
| "num_input_tokens_seen": 253438464, |
| "step": 247500 |
| }, |
| { |
| "epoch": 2.2882239506924646, |
| "grad_norm": 1.6029125452041626, |
| "learning_rate": 3.855892638008507e-05, |
| "loss": 0.0512, |
| "num_input_tokens_seen": 253950464, |
| "step": 248000 |
| }, |
| { |
| "epoch": 2.292837305431764, |
| "grad_norm": 0.9729604125022888, |
| "learning_rate": 3.8535859606388576e-05, |
| "loss": 0.0479, |
| "num_input_tokens_seen": 254462464, |
| "step": 248500 |
| }, |
| { |
| "epoch": 2.297450660171063, |
| "grad_norm": 2.042520046234131, |
| "learning_rate": 3.8512792832692076e-05, |
| "loss": 0.0505, |
| "num_input_tokens_seen": 254974464, |
| "step": 249000 |
| }, |
| { |
| "epoch": 2.3020640149103624, |
| "grad_norm": 0.6108492016792297, |
| "learning_rate": 3.8489726058995583e-05, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 255486464, |
| "step": 249500 |
| }, |
| { |
| "epoch": 2.3066773696496616, |
| "grad_norm": 3.030125379562378, |
| "learning_rate": 3.8466659285299084e-05, |
| "loss": 0.0489, |
| "num_input_tokens_seen": 255998464, |
| "step": 250000 |
| }, |
| { |
| "epoch": 2.3112907243889613, |
| "grad_norm": 1.440781831741333, |
| "learning_rate": 3.844359251160259e-05, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 256510464, |
| "step": 250500 |
| }, |
| { |
| "epoch": 2.3159040791282606, |
| "grad_norm": 2.0030038356781006, |
| "learning_rate": 3.842052573790609e-05, |
| "loss": 0.051, |
| "num_input_tokens_seen": 257022464, |
| "step": 251000 |
| }, |
| { |
| "epoch": 2.32051743386756, |
| "grad_norm": 0.7390642166137695, |
| "learning_rate": 3.83974589642096e-05, |
| "loss": 0.0524, |
| "num_input_tokens_seen": 257534464, |
| "step": 251500 |
| }, |
| { |
| "epoch": 2.325130788606859, |
| "grad_norm": 1.2793288230895996, |
| "learning_rate": 3.83743921905131e-05, |
| "loss": 0.0511, |
| "num_input_tokens_seen": 258046464, |
| "step": 252000 |
| }, |
| { |
| "epoch": 2.3297441433461583, |
| "grad_norm": 0.9258439540863037, |
| "learning_rate": 3.83513254168166e-05, |
| "loss": 0.0452, |
| "num_input_tokens_seen": 258558464, |
| "step": 252500 |
| }, |
| { |
| "epoch": 2.334357498085458, |
| "grad_norm": 1.6350897550582886, |
| "learning_rate": 3.832825864312011e-05, |
| "loss": 0.0512, |
| "num_input_tokens_seen": 259070464, |
| "step": 253000 |
| }, |
| { |
| "epoch": 2.3389708528247573, |
| "grad_norm": 0.529399037361145, |
| "learning_rate": 3.830519186942361e-05, |
| "loss": 0.0508, |
| "num_input_tokens_seen": 259582464, |
| "step": 253500 |
| }, |
| { |
| "epoch": 2.3435842075640565, |
| "grad_norm": 1.1488155126571655, |
| "learning_rate": 3.828212509572711e-05, |
| "loss": 0.0507, |
| "num_input_tokens_seen": 260094464, |
| "step": 254000 |
| }, |
| { |
| "epoch": 2.3481975623033557, |
| "grad_norm": 1.7055829763412476, |
| "learning_rate": 3.8259058322030616e-05, |
| "loss": 0.0512, |
| "num_input_tokens_seen": 260606464, |
| "step": 254500 |
| }, |
| { |
| "epoch": 2.352810917042655, |
| "grad_norm": 1.6156001091003418, |
| "learning_rate": 3.8235991548334124e-05, |
| "loss": 0.0475, |
| "num_input_tokens_seen": 261118464, |
| "step": 255000 |
| }, |
| { |
| "epoch": 2.3574242717819542, |
| "grad_norm": 1.6147477626800537, |
| "learning_rate": 3.821292477463762e-05, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 261630464, |
| "step": 255500 |
| }, |
| { |
| "epoch": 2.3620376265212535, |
| "grad_norm": 2.267575979232788, |
| "learning_rate": 3.8189858000941125e-05, |
| "loss": 0.0531, |
| "num_input_tokens_seen": 262142464, |
| "step": 256000 |
| }, |
| { |
| "epoch": 2.366650981260553, |
| "grad_norm": 4.673060417175293, |
| "learning_rate": 3.816679122724463e-05, |
| "loss": 0.0482, |
| "num_input_tokens_seen": 262654464, |
| "step": 256500 |
| }, |
| { |
| "epoch": 2.3712643359998524, |
| "grad_norm": 0.9855422377586365, |
| "learning_rate": 3.814372445354813e-05, |
| "loss": 0.0513, |
| "num_input_tokens_seen": 263166464, |
| "step": 257000 |
| }, |
| { |
| "epoch": 2.3758776907391517, |
| "grad_norm": 2.0277483463287354, |
| "learning_rate": 3.8120657679851633e-05, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 263678464, |
| "step": 257500 |
| }, |
| { |
| "epoch": 2.380491045478451, |
| "grad_norm": 2.461817979812622, |
| "learning_rate": 3.809759090615514e-05, |
| "loss": 0.0467, |
| "num_input_tokens_seen": 264190464, |
| "step": 258000 |
| }, |
| { |
| "epoch": 2.38510440021775, |
| "grad_norm": 1.2786630392074585, |
| "learning_rate": 3.807452413245864e-05, |
| "loss": 0.0449, |
| "num_input_tokens_seen": 264702464, |
| "step": 258500 |
| }, |
| { |
| "epoch": 2.38971775495705, |
| "grad_norm": 0.7494092583656311, |
| "learning_rate": 3.805145735876215e-05, |
| "loss": 0.0444, |
| "num_input_tokens_seen": 265214464, |
| "step": 259000 |
| }, |
| { |
| "epoch": 2.394331109696349, |
| "grad_norm": 0.7989722490310669, |
| "learning_rate": 3.802839058506565e-05, |
| "loss": 0.0474, |
| "num_input_tokens_seen": 265726464, |
| "step": 259500 |
| }, |
| { |
| "epoch": 2.3989444644356483, |
| "grad_norm": 1.17472505569458, |
| "learning_rate": 3.800532381136916e-05, |
| "loss": 0.0508, |
| "num_input_tokens_seen": 266238464, |
| "step": 260000 |
| }, |
| { |
| "epoch": 2.4035578191749476, |
| "grad_norm": 4.456437587738037, |
| "learning_rate": 3.798225703767266e-05, |
| "loss": 0.0536, |
| "num_input_tokens_seen": 266750464, |
| "step": 260500 |
| }, |
| { |
| "epoch": 2.408171173914247, |
| "grad_norm": 1.390002727508545, |
| "learning_rate": 3.795919026397616e-05, |
| "loss": 0.0489, |
| "num_input_tokens_seen": 267262464, |
| "step": 261000 |
| }, |
| { |
| "epoch": 2.412784528653546, |
| "grad_norm": 3.4362330436706543, |
| "learning_rate": 3.7936123490279665e-05, |
| "loss": 0.0455, |
| "num_input_tokens_seen": 267774464, |
| "step": 261500 |
| }, |
| { |
| "epoch": 2.417397883392846, |
| "grad_norm": 3.1407535076141357, |
| "learning_rate": 3.7913056716583166e-05, |
| "loss": 0.0488, |
| "num_input_tokens_seen": 268286464, |
| "step": 262000 |
| }, |
| { |
| "epoch": 2.422011238132145, |
| "grad_norm": 5.290740966796875, |
| "learning_rate": 3.788998994288667e-05, |
| "loss": 0.0529, |
| "num_input_tokens_seen": 268798464, |
| "step": 262500 |
| }, |
| { |
| "epoch": 2.4266245928714443, |
| "grad_norm": 0.8178442716598511, |
| "learning_rate": 3.7866923169190174e-05, |
| "loss": 0.0488, |
| "num_input_tokens_seen": 269310464, |
| "step": 263000 |
| }, |
| { |
| "epoch": 2.4312379476107435, |
| "grad_norm": 1.9484672546386719, |
| "learning_rate": 3.7843856395493674e-05, |
| "loss": 0.047, |
| "num_input_tokens_seen": 269822464, |
| "step": 263500 |
| }, |
| { |
| "epoch": 2.4358513023500428, |
| "grad_norm": 3.035595178604126, |
| "learning_rate": 3.782078962179718e-05, |
| "loss": 0.0465, |
| "num_input_tokens_seen": 270334464, |
| "step": 264000 |
| }, |
| { |
| "epoch": 2.4404646570893425, |
| "grad_norm": 1.731019377708435, |
| "learning_rate": 3.779772284810069e-05, |
| "loss": 0.0458, |
| "num_input_tokens_seen": 270846464, |
| "step": 264500 |
| }, |
| { |
| "epoch": 2.4450780118286417, |
| "grad_norm": 1.4459056854248047, |
| "learning_rate": 3.777465607440418e-05, |
| "loss": 0.0469, |
| "num_input_tokens_seen": 271358464, |
| "step": 265000 |
| }, |
| { |
| "epoch": 2.449691366567941, |
| "grad_norm": 1.475520372390747, |
| "learning_rate": 3.775158930070769e-05, |
| "loss": 0.0489, |
| "num_input_tokens_seen": 271870464, |
| "step": 265500 |
| }, |
| { |
| "epoch": 2.45430472130724, |
| "grad_norm": 1.0083856582641602, |
| "learning_rate": 3.77285225270112e-05, |
| "loss": 0.0474, |
| "num_input_tokens_seen": 272382464, |
| "step": 266000 |
| }, |
| { |
| "epoch": 2.4589180760465394, |
| "grad_norm": 1.0660340785980225, |
| "learning_rate": 3.770545575331469e-05, |
| "loss": 0.0531, |
| "num_input_tokens_seen": 272894464, |
| "step": 266500 |
| }, |
| { |
| "epoch": 2.4635314307858387, |
| "grad_norm": 2.4508252143859863, |
| "learning_rate": 3.76823889796182e-05, |
| "loss": 0.0484, |
| "num_input_tokens_seen": 273406464, |
| "step": 267000 |
| }, |
| { |
| "epoch": 2.468144785525138, |
| "grad_norm": 1.2447962760925293, |
| "learning_rate": 3.7659322205921706e-05, |
| "loss": 0.0543, |
| "num_input_tokens_seen": 273918464, |
| "step": 267500 |
| }, |
| { |
| "epoch": 2.4727581402644376, |
| "grad_norm": 0.9269862174987793, |
| "learning_rate": 3.763625543222521e-05, |
| "loss": 0.047, |
| "num_input_tokens_seen": 274430464, |
| "step": 268000 |
| }, |
| { |
| "epoch": 2.477371495003737, |
| "grad_norm": 1.8680906295776367, |
| "learning_rate": 3.761318865852871e-05, |
| "loss": 0.0488, |
| "num_input_tokens_seen": 274942464, |
| "step": 268500 |
| }, |
| { |
| "epoch": 2.481984849743036, |
| "grad_norm": 2.0206573009490967, |
| "learning_rate": 3.7590121884832215e-05, |
| "loss": 0.0481, |
| "num_input_tokens_seen": 275454464, |
| "step": 269000 |
| }, |
| { |
| "epoch": 2.4865982044823354, |
| "grad_norm": 1.7884100675582886, |
| "learning_rate": 3.7567055111135715e-05, |
| "loss": 0.0491, |
| "num_input_tokens_seen": 275966464, |
| "step": 269500 |
| }, |
| { |
| "epoch": 2.4912115592216346, |
| "grad_norm": 0.8701728582382202, |
| "learning_rate": 3.754398833743922e-05, |
| "loss": 0.0482, |
| "num_input_tokens_seen": 276478464, |
| "step": 270000 |
| }, |
| { |
| "epoch": 2.4958249139609343, |
| "grad_norm": 1.0109634399414062, |
| "learning_rate": 3.752092156374272e-05, |
| "loss": 0.0501, |
| "num_input_tokens_seen": 276990464, |
| "step": 270500 |
| }, |
| { |
| "epoch": 2.5004382687002336, |
| "grad_norm": 2.7722220420837402, |
| "learning_rate": 3.749785479004623e-05, |
| "loss": 0.0521, |
| "num_input_tokens_seen": 277502464, |
| "step": 271000 |
| }, |
| { |
| "epoch": 2.505051623439533, |
| "grad_norm": 0.6980007886886597, |
| "learning_rate": 3.747478801634973e-05, |
| "loss": 0.0489, |
| "num_input_tokens_seen": 278014464, |
| "step": 271500 |
| }, |
| { |
| "epoch": 2.509664978178832, |
| "grad_norm": 1.2792749404907227, |
| "learning_rate": 3.745172124265324e-05, |
| "loss": 0.0489, |
| "num_input_tokens_seen": 278526464, |
| "step": 272000 |
| }, |
| { |
| "epoch": 2.5142783329181313, |
| "grad_norm": 2.294569969177246, |
| "learning_rate": 3.742865446895674e-05, |
| "loss": 0.0499, |
| "num_input_tokens_seen": 279038464, |
| "step": 272500 |
| }, |
| { |
| "epoch": 2.5188916876574305, |
| "grad_norm": 0.667633593082428, |
| "learning_rate": 3.740558769526024e-05, |
| "loss": 0.0493, |
| "num_input_tokens_seen": 279550464, |
| "step": 273000 |
| }, |
| { |
| "epoch": 2.52350504239673, |
| "grad_norm": 1.3469390869140625, |
| "learning_rate": 3.738252092156375e-05, |
| "loss": 0.0495, |
| "num_input_tokens_seen": 280062464, |
| "step": 273500 |
| }, |
| { |
| "epoch": 2.5281183971360295, |
| "grad_norm": 1.247475266456604, |
| "learning_rate": 3.735945414786725e-05, |
| "loss": 0.0511, |
| "num_input_tokens_seen": 280574464, |
| "step": 274000 |
| }, |
| { |
| "epoch": 2.5327317518753287, |
| "grad_norm": 0.4033117890357971, |
| "learning_rate": 3.733638737417075e-05, |
| "loss": 0.0535, |
| "num_input_tokens_seen": 281086464, |
| "step": 274500 |
| }, |
| { |
| "epoch": 2.537345106614628, |
| "grad_norm": 1.1649394035339355, |
| "learning_rate": 3.7313320600474255e-05, |
| "loss": 0.0532, |
| "num_input_tokens_seen": 281598464, |
| "step": 275000 |
| }, |
| { |
| "epoch": 2.5419584613539272, |
| "grad_norm": 2.126436710357666, |
| "learning_rate": 3.729025382677776e-05, |
| "loss": 0.0485, |
| "num_input_tokens_seen": 282110464, |
| "step": 275500 |
| }, |
| { |
| "epoch": 2.546571816093227, |
| "grad_norm": 0.8005649447441101, |
| "learning_rate": 3.726718705308126e-05, |
| "loss": 0.0506, |
| "num_input_tokens_seen": 282622464, |
| "step": 276000 |
| }, |
| { |
| "epoch": 2.551185170832526, |
| "grad_norm": 2.3989765644073486, |
| "learning_rate": 3.7244120279384764e-05, |
| "loss": 0.0513, |
| "num_input_tokens_seen": 283134464, |
| "step": 276500 |
| }, |
| { |
| "epoch": 2.5557985255718254, |
| "grad_norm": 0.7040809988975525, |
| "learning_rate": 3.722105350568827e-05, |
| "loss": 0.0496, |
| "num_input_tokens_seen": 283646464, |
| "step": 277000 |
| }, |
| { |
| "epoch": 2.5604118803111247, |
| "grad_norm": 1.1335313320159912, |
| "learning_rate": 3.719798673199177e-05, |
| "loss": 0.0546, |
| "num_input_tokens_seen": 284158464, |
| "step": 277500 |
| }, |
| { |
| "epoch": 2.565025235050424, |
| "grad_norm": 0.9312555193901062, |
| "learning_rate": 3.717491995829527e-05, |
| "loss": 0.0516, |
| "num_input_tokens_seen": 284670464, |
| "step": 278000 |
| }, |
| { |
| "epoch": 2.569638589789723, |
| "grad_norm": 0.7695990800857544, |
| "learning_rate": 3.715185318459878e-05, |
| "loss": 0.0521, |
| "num_input_tokens_seen": 285182464, |
| "step": 278500 |
| }, |
| { |
| "epoch": 2.5742519445290224, |
| "grad_norm": 1.258518934249878, |
| "learning_rate": 3.712878641090228e-05, |
| "loss": 0.0499, |
| "num_input_tokens_seen": 285694464, |
| "step": 279000 |
| }, |
| { |
| "epoch": 2.578865299268322, |
| "grad_norm": 2.346951961517334, |
| "learning_rate": 3.710571963720578e-05, |
| "loss": 0.0472, |
| "num_input_tokens_seen": 286206464, |
| "step": 279500 |
| }, |
| { |
| "epoch": 2.5834786540076213, |
| "grad_norm": 0.8598672747612, |
| "learning_rate": 3.708265286350929e-05, |
| "loss": 0.0526, |
| "num_input_tokens_seen": 286718464, |
| "step": 280000 |
| }, |
| { |
| "epoch": 2.5880920087469206, |
| "grad_norm": 1.0490000247955322, |
| "learning_rate": 3.705958608981279e-05, |
| "loss": 0.0441, |
| "num_input_tokens_seen": 287230464, |
| "step": 280500 |
| }, |
| { |
| "epoch": 2.59270536348622, |
| "grad_norm": 0.49518364667892456, |
| "learning_rate": 3.7036519316116296e-05, |
| "loss": 0.0474, |
| "num_input_tokens_seen": 287742464, |
| "step": 281000 |
| }, |
| { |
| "epoch": 2.597318718225519, |
| "grad_norm": 1.5736312866210938, |
| "learning_rate": 3.70134525424198e-05, |
| "loss": 0.0509, |
| "num_input_tokens_seen": 288254464, |
| "step": 281500 |
| }, |
| { |
| "epoch": 2.6019320729648188, |
| "grad_norm": 2.511143445968628, |
| "learning_rate": 3.6990385768723304e-05, |
| "loss": 0.047, |
| "num_input_tokens_seen": 288766464, |
| "step": 282000 |
| }, |
| { |
| "epoch": 2.606545427704118, |
| "grad_norm": 0.9060021638870239, |
| "learning_rate": 3.6967318995026805e-05, |
| "loss": 0.053, |
| "num_input_tokens_seen": 289278464, |
| "step": 282500 |
| }, |
| { |
| "epoch": 2.6111587824434173, |
| "grad_norm": 1.4283766746520996, |
| "learning_rate": 3.694425222133031e-05, |
| "loss": 0.0476, |
| "num_input_tokens_seen": 289790464, |
| "step": 283000 |
| }, |
| { |
| "epoch": 2.6157721371827165, |
| "grad_norm": 1.5333555936813354, |
| "learning_rate": 3.692118544763381e-05, |
| "loss": 0.0538, |
| "num_input_tokens_seen": 290302464, |
| "step": 283500 |
| }, |
| { |
| "epoch": 2.6203854919220158, |
| "grad_norm": 1.615579605102539, |
| "learning_rate": 3.689811867393731e-05, |
| "loss": 0.0475, |
| "num_input_tokens_seen": 290814464, |
| "step": 284000 |
| }, |
| { |
| "epoch": 2.624998846661315, |
| "grad_norm": 1.5331679582595825, |
| "learning_rate": 3.687505190024082e-05, |
| "loss": 0.0471, |
| "num_input_tokens_seen": 291326464, |
| "step": 284500 |
| }, |
| { |
| "epoch": 2.6296122014006142, |
| "grad_norm": 2.3747360706329346, |
| "learning_rate": 3.685198512654433e-05, |
| "loss": 0.0477, |
| "num_input_tokens_seen": 291838464, |
| "step": 285000 |
| }, |
| { |
| "epoch": 2.634225556139914, |
| "grad_norm": 2.0471205711364746, |
| "learning_rate": 3.682891835284782e-05, |
| "loss": 0.0493, |
| "num_input_tokens_seen": 292350464, |
| "step": 285500 |
| }, |
| { |
| "epoch": 2.638838910879213, |
| "grad_norm": 1.0454156398773193, |
| "learning_rate": 3.680585157915133e-05, |
| "loss": 0.0467, |
| "num_input_tokens_seen": 292862464, |
| "step": 286000 |
| }, |
| { |
| "epoch": 2.6434522656185124, |
| "grad_norm": 2.0174975395202637, |
| "learning_rate": 3.678278480545484e-05, |
| "loss": 0.0526, |
| "num_input_tokens_seen": 293374464, |
| "step": 286500 |
| }, |
| { |
| "epoch": 2.6480656203578117, |
| "grad_norm": 1.8630324602127075, |
| "learning_rate": 3.675971803175833e-05, |
| "loss": 0.0489, |
| "num_input_tokens_seen": 293886464, |
| "step": 287000 |
| }, |
| { |
| "epoch": 2.6526789750971114, |
| "grad_norm": 2.270232915878296, |
| "learning_rate": 3.673665125806184e-05, |
| "loss": 0.0509, |
| "num_input_tokens_seen": 294398464, |
| "step": 287500 |
| }, |
| { |
| "epoch": 2.6572923298364106, |
| "grad_norm": 1.7369494438171387, |
| "learning_rate": 3.6713584484365345e-05, |
| "loss": 0.0504, |
| "num_input_tokens_seen": 294910464, |
| "step": 288000 |
| }, |
| { |
| "epoch": 2.66190568457571, |
| "grad_norm": 0.9229201078414917, |
| "learning_rate": 3.6690517710668846e-05, |
| "loss": 0.0467, |
| "num_input_tokens_seen": 295422464, |
| "step": 288500 |
| }, |
| { |
| "epoch": 2.666519039315009, |
| "grad_norm": 1.377439260482788, |
| "learning_rate": 3.6667450936972346e-05, |
| "loss": 0.0515, |
| "num_input_tokens_seen": 295934464, |
| "step": 289000 |
| }, |
| { |
| "epoch": 2.6711323940543084, |
| "grad_norm": 1.9601995944976807, |
| "learning_rate": 3.6644384163275854e-05, |
| "loss": 0.0527, |
| "num_input_tokens_seen": 296446464, |
| "step": 289500 |
| }, |
| { |
| "epoch": 2.6757457487936076, |
| "grad_norm": 1.4592013359069824, |
| "learning_rate": 3.6621317389579354e-05, |
| "loss": 0.0491, |
| "num_input_tokens_seen": 296958464, |
| "step": 290000 |
| }, |
| { |
| "epoch": 2.680359103532907, |
| "grad_norm": 0.35405218601226807, |
| "learning_rate": 3.659825061588286e-05, |
| "loss": 0.0472, |
| "num_input_tokens_seen": 297470464, |
| "step": 290500 |
| }, |
| { |
| "epoch": 2.6849724582722065, |
| "grad_norm": 1.9252680540084839, |
| "learning_rate": 3.657518384218636e-05, |
| "loss": 0.0469, |
| "num_input_tokens_seen": 297982464, |
| "step": 291000 |
| }, |
| { |
| "epoch": 2.689585813011506, |
| "grad_norm": 1.1235663890838623, |
| "learning_rate": 3.655211706848987e-05, |
| "loss": 0.0485, |
| "num_input_tokens_seen": 298494464, |
| "step": 291500 |
| }, |
| { |
| "epoch": 2.694199167750805, |
| "grad_norm": 0.9481515884399414, |
| "learning_rate": 3.652905029479337e-05, |
| "loss": 0.049, |
| "num_input_tokens_seen": 299006464, |
| "step": 292000 |
| }, |
| { |
| "epoch": 2.6988125224901043, |
| "grad_norm": 0.37934771180152893, |
| "learning_rate": 3.650598352109687e-05, |
| "loss": 0.052, |
| "num_input_tokens_seen": 299518464, |
| "step": 292500 |
| }, |
| { |
| "epoch": 2.7034258772294035, |
| "grad_norm": 1.1855201721191406, |
| "learning_rate": 3.648291674740038e-05, |
| "loss": 0.0492, |
| "num_input_tokens_seen": 300030464, |
| "step": 293000 |
| }, |
| { |
| "epoch": 2.708039231968703, |
| "grad_norm": 1.4538213014602661, |
| "learning_rate": 3.645984997370388e-05, |
| "loss": 0.0503, |
| "num_input_tokens_seen": 300542464, |
| "step": 293500 |
| }, |
| { |
| "epoch": 2.7126525867080025, |
| "grad_norm": 2.1017704010009766, |
| "learning_rate": 3.6436783200007386e-05, |
| "loss": 0.0458, |
| "num_input_tokens_seen": 301054464, |
| "step": 294000 |
| }, |
| { |
| "epoch": 2.7172659414473017, |
| "grad_norm": 0.6946723461151123, |
| "learning_rate": 3.6413716426310887e-05, |
| "loss": 0.0524, |
| "num_input_tokens_seen": 301566464, |
| "step": 294500 |
| }, |
| { |
| "epoch": 2.721879296186601, |
| "grad_norm": 3.0771243572235107, |
| "learning_rate": 3.639064965261439e-05, |
| "loss": 0.0518, |
| "num_input_tokens_seen": 302078464, |
| "step": 295000 |
| }, |
| { |
| "epoch": 2.7264926509259, |
| "grad_norm": 1.259162425994873, |
| "learning_rate": 3.6367582878917895e-05, |
| "loss": 0.0532, |
| "num_input_tokens_seen": 302590464, |
| "step": 295500 |
| }, |
| { |
| "epoch": 2.7311060056651995, |
| "grad_norm": 1.8771902322769165, |
| "learning_rate": 3.63445161052214e-05, |
| "loss": 0.0487, |
| "num_input_tokens_seen": 303102464, |
| "step": 296000 |
| }, |
| { |
| "epoch": 2.7357193604044987, |
| "grad_norm": 1.765956997871399, |
| "learning_rate": 3.6321449331524896e-05, |
| "loss": 0.0437, |
| "num_input_tokens_seen": 303614464, |
| "step": 296500 |
| }, |
| { |
| "epoch": 2.7403327151437984, |
| "grad_norm": 1.2610450983047485, |
| "learning_rate": 3.62983825578284e-05, |
| "loss": 0.044, |
| "num_input_tokens_seen": 304126464, |
| "step": 297000 |
| }, |
| { |
| "epoch": 2.7449460698830976, |
| "grad_norm": 4.452374458312988, |
| "learning_rate": 3.627531578413191e-05, |
| "loss": 0.0507, |
| "num_input_tokens_seen": 304638464, |
| "step": 297500 |
| }, |
| { |
| "epoch": 2.749559424622397, |
| "grad_norm": 1.082930088043213, |
| "learning_rate": 3.625224901043541e-05, |
| "loss": 0.0518, |
| "num_input_tokens_seen": 305150464, |
| "step": 298000 |
| }, |
| { |
| "epoch": 2.754172779361696, |
| "grad_norm": 0.708118200302124, |
| "learning_rate": 3.622918223673891e-05, |
| "loss": 0.0483, |
| "num_input_tokens_seen": 305662464, |
| "step": 298500 |
| }, |
| { |
| "epoch": 2.758786134100996, |
| "grad_norm": 1.1710622310638428, |
| "learning_rate": 3.620611546304242e-05, |
| "loss": 0.051, |
| "num_input_tokens_seen": 306174464, |
| "step": 299000 |
| }, |
| { |
| "epoch": 2.763399488840295, |
| "grad_norm": 2.388134002685547, |
| "learning_rate": 3.618304868934592e-05, |
| "loss": 0.0506, |
| "num_input_tokens_seen": 306686464, |
| "step": 299500 |
| }, |
| { |
| "epoch": 2.7680128435795943, |
| "grad_norm": 2.3141307830810547, |
| "learning_rate": 3.615998191564942e-05, |
| "loss": 0.0464, |
| "num_input_tokens_seen": 307198464, |
| "step": 300000 |
| }, |
| { |
| "epoch": 2.7726261983188936, |
| "grad_norm": 1.966213345527649, |
| "learning_rate": 3.613691514195293e-05, |
| "loss": 0.0501, |
| "num_input_tokens_seen": 307710464, |
| "step": 300500 |
| }, |
| { |
| "epoch": 2.777239553058193, |
| "grad_norm": 3.948702573776245, |
| "learning_rate": 3.611384836825643e-05, |
| "loss": 0.0495, |
| "num_input_tokens_seen": 308222464, |
| "step": 301000 |
| }, |
| { |
| "epoch": 2.781852907797492, |
| "grad_norm": 1.3868130445480347, |
| "learning_rate": 3.6090781594559935e-05, |
| "loss": 0.0471, |
| "num_input_tokens_seen": 308734464, |
| "step": 301500 |
| }, |
| { |
| "epoch": 2.7864662625367913, |
| "grad_norm": 1.42705500125885, |
| "learning_rate": 3.6067714820863436e-05, |
| "loss": 0.0474, |
| "num_input_tokens_seen": 309246464, |
| "step": 302000 |
| }, |
| { |
| "epoch": 2.7910796172760906, |
| "grad_norm": 1.4073491096496582, |
| "learning_rate": 3.604464804716694e-05, |
| "loss": 0.0459, |
| "num_input_tokens_seen": 309758464, |
| "step": 302500 |
| }, |
| { |
| "epoch": 2.7956929720153902, |
| "grad_norm": 1.990958333015442, |
| "learning_rate": 3.6021581273470444e-05, |
| "loss": 0.0461, |
| "num_input_tokens_seen": 310270464, |
| "step": 303000 |
| }, |
| { |
| "epoch": 2.8003063267546895, |
| "grad_norm": 2.2346065044403076, |
| "learning_rate": 3.599851449977395e-05, |
| "loss": 0.0534, |
| "num_input_tokens_seen": 310782464, |
| "step": 303500 |
| }, |
| { |
| "epoch": 2.8049196814939887, |
| "grad_norm": 1.1180897951126099, |
| "learning_rate": 3.597544772607745e-05, |
| "loss": 0.0459, |
| "num_input_tokens_seen": 311294464, |
| "step": 304000 |
| }, |
| { |
| "epoch": 2.809533036233288, |
| "grad_norm": 1.765995979309082, |
| "learning_rate": 3.595238095238095e-05, |
| "loss": 0.0443, |
| "num_input_tokens_seen": 311806464, |
| "step": 304500 |
| }, |
| { |
| "epoch": 2.8141463909725877, |
| "grad_norm": 0.6811426877975464, |
| "learning_rate": 3.592931417868446e-05, |
| "loss": 0.0488, |
| "num_input_tokens_seen": 312318464, |
| "step": 305000 |
| }, |
| { |
| "epoch": 2.818759745711887, |
| "grad_norm": 2.811584234237671, |
| "learning_rate": 3.590624740498796e-05, |
| "loss": 0.0517, |
| "num_input_tokens_seen": 312830464, |
| "step": 305500 |
| }, |
| { |
| "epoch": 2.823373100451186, |
| "grad_norm": 2.9501793384552, |
| "learning_rate": 3.588318063129146e-05, |
| "loss": 0.0537, |
| "num_input_tokens_seen": 313342464, |
| "step": 306000 |
| }, |
| { |
| "epoch": 2.8279864551904854, |
| "grad_norm": 0.9767802357673645, |
| "learning_rate": 3.586011385759497e-05, |
| "loss": 0.0473, |
| "num_input_tokens_seen": 313854464, |
| "step": 306500 |
| }, |
| { |
| "epoch": 2.8325998099297847, |
| "grad_norm": 1.463254451751709, |
| "learning_rate": 3.5837047083898476e-05, |
| "loss": 0.0498, |
| "num_input_tokens_seen": 314366464, |
| "step": 307000 |
| }, |
| { |
| "epoch": 2.837213164669084, |
| "grad_norm": 1.6375666856765747, |
| "learning_rate": 3.581398031020197e-05, |
| "loss": 0.0494, |
| "num_input_tokens_seen": 314878464, |
| "step": 307500 |
| }, |
| { |
| "epoch": 2.841826519408383, |
| "grad_norm": 6.093188285827637, |
| "learning_rate": 3.579091353650548e-05, |
| "loss": 0.0505, |
| "num_input_tokens_seen": 315390464, |
| "step": 308000 |
| }, |
| { |
| "epoch": 2.846439874147683, |
| "grad_norm": 1.2764623165130615, |
| "learning_rate": 3.5767846762808984e-05, |
| "loss": 0.0529, |
| "num_input_tokens_seen": 315902464, |
| "step": 308500 |
| }, |
| { |
| "epoch": 2.851053228886982, |
| "grad_norm": 0.9110862612724304, |
| "learning_rate": 3.5744779989112485e-05, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 316414464, |
| "step": 309000 |
| }, |
| { |
| "epoch": 2.8556665836262813, |
| "grad_norm": 1.6029491424560547, |
| "learning_rate": 3.5721713215415985e-05, |
| "loss": 0.0524, |
| "num_input_tokens_seen": 316926464, |
| "step": 309500 |
| }, |
| { |
| "epoch": 2.8602799383655806, |
| "grad_norm": 1.162832498550415, |
| "learning_rate": 3.569864644171949e-05, |
| "loss": 0.0497, |
| "num_input_tokens_seen": 317438464, |
| "step": 310000 |
| }, |
| { |
| "epoch": 2.8648932931048803, |
| "grad_norm": 0.8766358494758606, |
| "learning_rate": 3.567557966802299e-05, |
| "loss": 0.0529, |
| "num_input_tokens_seen": 317950464, |
| "step": 310500 |
| }, |
| { |
| "epoch": 2.8695066478441795, |
| "grad_norm": 1.384810209274292, |
| "learning_rate": 3.56525128943265e-05, |
| "loss": 0.0495, |
| "num_input_tokens_seen": 318462464, |
| "step": 311000 |
| }, |
| { |
| "epoch": 2.8741200025834788, |
| "grad_norm": 3.1389269828796387, |
| "learning_rate": 3.562944612063e-05, |
| "loss": 0.0495, |
| "num_input_tokens_seen": 318974464, |
| "step": 311500 |
| }, |
| { |
| "epoch": 2.878733357322778, |
| "grad_norm": 2.004563570022583, |
| "learning_rate": 3.56063793469335e-05, |
| "loss": 0.0498, |
| "num_input_tokens_seen": 319486464, |
| "step": 312000 |
| }, |
| { |
| "epoch": 2.8833467120620773, |
| "grad_norm": 2.8419971466064453, |
| "learning_rate": 3.558331257323701e-05, |
| "loss": 0.0497, |
| "num_input_tokens_seen": 319998464, |
| "step": 312500 |
| }, |
| { |
| "epoch": 2.8879600668013765, |
| "grad_norm": 1.0195252895355225, |
| "learning_rate": 3.556024579954051e-05, |
| "loss": 0.0496, |
| "num_input_tokens_seen": 320510464, |
| "step": 313000 |
| }, |
| { |
| "epoch": 2.8925734215406758, |
| "grad_norm": 1.6460163593292236, |
| "learning_rate": 3.553717902584402e-05, |
| "loss": 0.0465, |
| "num_input_tokens_seen": 321022464, |
| "step": 313500 |
| }, |
| { |
| "epoch": 2.897186776279975, |
| "grad_norm": 0.9986339211463928, |
| "learning_rate": 3.551411225214752e-05, |
| "loss": 0.0494, |
| "num_input_tokens_seen": 321534464, |
| "step": 314000 |
| }, |
| { |
| "epoch": 2.9018001310192747, |
| "grad_norm": 0.7910524606704712, |
| "learning_rate": 3.5491045478451025e-05, |
| "loss": 0.0488, |
| "num_input_tokens_seen": 322046464, |
| "step": 314500 |
| }, |
| { |
| "epoch": 2.906413485758574, |
| "grad_norm": 0.8609081506729126, |
| "learning_rate": 3.5467978704754526e-05, |
| "loss": 0.0522, |
| "num_input_tokens_seen": 322558464, |
| "step": 315000 |
| }, |
| { |
| "epoch": 2.911026840497873, |
| "grad_norm": 0.49892082810401917, |
| "learning_rate": 3.5444911931058026e-05, |
| "loss": 0.0471, |
| "num_input_tokens_seen": 323070464, |
| "step": 315500 |
| }, |
| { |
| "epoch": 2.9156401952371724, |
| "grad_norm": 1.161789894104004, |
| "learning_rate": 3.5421845157361534e-05, |
| "loss": 0.0519, |
| "num_input_tokens_seen": 323582464, |
| "step": 316000 |
| }, |
| { |
| "epoch": 2.920253549976472, |
| "grad_norm": 2.9082627296447754, |
| "learning_rate": 3.539877838366504e-05, |
| "loss": 0.0517, |
| "num_input_tokens_seen": 324094464, |
| "step": 316500 |
| }, |
| { |
| "epoch": 2.9248669047157714, |
| "grad_norm": 2.1669368743896484, |
| "learning_rate": 3.5375711609968535e-05, |
| "loss": 0.0506, |
| "num_input_tokens_seen": 324606464, |
| "step": 317000 |
| }, |
| { |
| "epoch": 2.9294802594550706, |
| "grad_norm": 0.955956220626831, |
| "learning_rate": 3.535264483627204e-05, |
| "loss": 0.0508, |
| "num_input_tokens_seen": 325118464, |
| "step": 317500 |
| }, |
| { |
| "epoch": 2.93409361419437, |
| "grad_norm": 1.6256439685821533, |
| "learning_rate": 3.532957806257555e-05, |
| "loss": 0.0468, |
| "num_input_tokens_seen": 325630464, |
| "step": 318000 |
| }, |
| { |
| "epoch": 2.938706968933669, |
| "grad_norm": 1.479632019996643, |
| "learning_rate": 3.530651128887904e-05, |
| "loss": 0.0468, |
| "num_input_tokens_seen": 326142464, |
| "step": 318500 |
| }, |
| { |
| "epoch": 2.9433203236729684, |
| "grad_norm": 0.8990212082862854, |
| "learning_rate": 3.528344451518255e-05, |
| "loss": 0.0515, |
| "num_input_tokens_seen": 326654464, |
| "step": 319000 |
| }, |
| { |
| "epoch": 2.9479336784122676, |
| "grad_norm": 0.5225000381469727, |
| "learning_rate": 3.526037774148606e-05, |
| "loss": 0.0474, |
| "num_input_tokens_seen": 327166464, |
| "step": 319500 |
| }, |
| { |
| "epoch": 2.9525470331515673, |
| "grad_norm": 0.6462964415550232, |
| "learning_rate": 3.523731096778956e-05, |
| "loss": 0.0523, |
| "num_input_tokens_seen": 327678464, |
| "step": 320000 |
| }, |
| { |
| "epoch": 2.9571603878908665, |
| "grad_norm": 1.1759368181228638, |
| "learning_rate": 3.521424419409306e-05, |
| "loss": 0.0485, |
| "num_input_tokens_seen": 328190464, |
| "step": 320500 |
| }, |
| { |
| "epoch": 2.961773742630166, |
| "grad_norm": 0.6114454865455627, |
| "learning_rate": 3.5191177420396567e-05, |
| "loss": 0.0522, |
| "num_input_tokens_seen": 328702464, |
| "step": 321000 |
| }, |
| { |
| "epoch": 2.966387097369465, |
| "grad_norm": 0.8368657231330872, |
| "learning_rate": 3.516811064670007e-05, |
| "loss": 0.0468, |
| "num_input_tokens_seen": 329214464, |
| "step": 321500 |
| }, |
| { |
| "epoch": 2.9710004521087643, |
| "grad_norm": 0.39750799536705017, |
| "learning_rate": 3.5145043873003574e-05, |
| "loss": 0.0491, |
| "num_input_tokens_seen": 329726464, |
| "step": 322000 |
| }, |
| { |
| "epoch": 2.975613806848064, |
| "grad_norm": 1.4396777153015137, |
| "learning_rate": 3.5121977099307075e-05, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 330238464, |
| "step": 322500 |
| }, |
| { |
| "epoch": 2.9802271615873632, |
| "grad_norm": 6.470019817352295, |
| "learning_rate": 3.5098910325610576e-05, |
| "loss": 0.0466, |
| "num_input_tokens_seen": 330750464, |
| "step": 323000 |
| }, |
| { |
| "epoch": 2.9848405163266625, |
| "grad_norm": 0.8978260159492493, |
| "learning_rate": 3.507584355191408e-05, |
| "loss": 0.051, |
| "num_input_tokens_seen": 331262464, |
| "step": 323500 |
| }, |
| { |
| "epoch": 2.9894538710659617, |
| "grad_norm": 1.2832305431365967, |
| "learning_rate": 3.505277677821759e-05, |
| "loss": 0.05, |
| "num_input_tokens_seen": 331774464, |
| "step": 324000 |
| }, |
| { |
| "epoch": 2.994067225805261, |
| "grad_norm": 1.4465861320495605, |
| "learning_rate": 3.502971000452109e-05, |
| "loss": 0.0491, |
| "num_input_tokens_seen": 332286464, |
| "step": 324500 |
| }, |
| { |
| "epoch": 2.99868058054456, |
| "grad_norm": 0.7884268164634705, |
| "learning_rate": 3.500664323082459e-05, |
| "loss": 0.0559, |
| "num_input_tokens_seen": 332798464, |
| "step": 325000 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_combined_score": 0.07028037235137267, |
| "eval_loss": 0.07028037309646606, |
| "eval_mse": 0.07028037160627928, |
| "eval_runtime": 46.6351, |
| "eval_samples_per_second": 2065.784, |
| "eval_steps_per_second": 258.239, |
| "num_input_tokens_seen": 332944128, |
| "step": 325143 |
| }, |
| { |
| "epoch": 3.00329393528386, |
| "grad_norm": 1.5264211893081665, |
| "learning_rate": 3.49835764571281e-05, |
| "loss": 0.0458, |
| "num_input_tokens_seen": 333309696, |
| "step": 325500 |
| }, |
| { |
| "epoch": 3.007907290023159, |
| "grad_norm": 0.4709686040878296, |
| "learning_rate": 3.49605096834316e-05, |
| "loss": 0.0373, |
| "num_input_tokens_seen": 333821696, |
| "step": 326000 |
| }, |
| { |
| "epoch": 3.0125206447624584, |
| "grad_norm": 1.1726654767990112, |
| "learning_rate": 3.49374429097351e-05, |
| "loss": 0.0367, |
| "num_input_tokens_seen": 334333696, |
| "step": 326500 |
| }, |
| { |
| "epoch": 3.0171339995017576, |
| "grad_norm": 0.5303038358688354, |
| "learning_rate": 3.491437613603861e-05, |
| "loss": 0.0398, |
| "num_input_tokens_seen": 334845696, |
| "step": 327000 |
| }, |
| { |
| "epoch": 3.021747354241057, |
| "grad_norm": 1.8502370119094849, |
| "learning_rate": 3.4891309362342115e-05, |
| "loss": 0.0344, |
| "num_input_tokens_seen": 335357696, |
| "step": 327500 |
| }, |
| { |
| "epoch": 3.026360708980356, |
| "grad_norm": 0.6410061120986938, |
| "learning_rate": 3.486824258864561e-05, |
| "loss": 0.0387, |
| "num_input_tokens_seen": 335869696, |
| "step": 328000 |
| }, |
| { |
| "epoch": 3.030974063719656, |
| "grad_norm": 2.9425787925720215, |
| "learning_rate": 3.4845175814949116e-05, |
| "loss": 0.0408, |
| "num_input_tokens_seen": 336381696, |
| "step": 328500 |
| }, |
| { |
| "epoch": 3.035587418458955, |
| "grad_norm": 3.2158591747283936, |
| "learning_rate": 3.482210904125262e-05, |
| "loss": 0.039, |
| "num_input_tokens_seen": 336893696, |
| "step": 329000 |
| }, |
| { |
| "epoch": 3.0402007731982543, |
| "grad_norm": 1.0993469953536987, |
| "learning_rate": 3.4799042267556124e-05, |
| "loss": 0.0427, |
| "num_input_tokens_seen": 337405696, |
| "step": 329500 |
| }, |
| { |
| "epoch": 3.0448141279375536, |
| "grad_norm": 0.733238697052002, |
| "learning_rate": 3.4775975493859624e-05, |
| "loss": 0.0364, |
| "num_input_tokens_seen": 337917696, |
| "step": 330000 |
| }, |
| { |
| "epoch": 3.049427482676853, |
| "grad_norm": 1.7866772413253784, |
| "learning_rate": 3.475290872016313e-05, |
| "loss": 0.0367, |
| "num_input_tokens_seen": 338429696, |
| "step": 330500 |
| }, |
| { |
| "epoch": 3.054040837416152, |
| "grad_norm": 2.1485824584960938, |
| "learning_rate": 3.472984194646663e-05, |
| "loss": 0.0375, |
| "num_input_tokens_seen": 338941696, |
| "step": 331000 |
| }, |
| { |
| "epoch": 3.0586541921554518, |
| "grad_norm": 0.9480071663856506, |
| "learning_rate": 3.470677517277013e-05, |
| "loss": 0.0361, |
| "num_input_tokens_seen": 339453696, |
| "step": 331500 |
| }, |
| { |
| "epoch": 3.063267546894751, |
| "grad_norm": 1.3875316381454468, |
| "learning_rate": 3.468370839907364e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 339965696, |
| "step": 332000 |
| }, |
| { |
| "epoch": 3.0678809016340503, |
| "grad_norm": 1.2781360149383545, |
| "learning_rate": 3.466064162537714e-05, |
| "loss": 0.0407, |
| "num_input_tokens_seen": 340477696, |
| "step": 332500 |
| }, |
| { |
| "epoch": 3.0724942563733495, |
| "grad_norm": 1.129167079925537, |
| "learning_rate": 3.463757485168065e-05, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 340989696, |
| "step": 333000 |
| }, |
| { |
| "epoch": 3.0771076111126487, |
| "grad_norm": 1.3005669116973877, |
| "learning_rate": 3.461450807798415e-05, |
| "loss": 0.0389, |
| "num_input_tokens_seen": 341501696, |
| "step": 333500 |
| }, |
| { |
| "epoch": 3.0817209658519484, |
| "grad_norm": 1.7916690111160278, |
| "learning_rate": 3.4591441304287656e-05, |
| "loss": 0.0357, |
| "num_input_tokens_seen": 342013696, |
| "step": 334000 |
| }, |
| { |
| "epoch": 3.0863343205912477, |
| "grad_norm": 0.6907594799995422, |
| "learning_rate": 3.456837453059116e-05, |
| "loss": 0.0408, |
| "num_input_tokens_seen": 342525696, |
| "step": 334500 |
| }, |
| { |
| "epoch": 3.090947675330547, |
| "grad_norm": 1.9678852558135986, |
| "learning_rate": 3.4545307756894664e-05, |
| "loss": 0.0394, |
| "num_input_tokens_seen": 343037696, |
| "step": 335000 |
| }, |
| { |
| "epoch": 3.095561030069846, |
| "grad_norm": 2.437412977218628, |
| "learning_rate": 3.4522240983198165e-05, |
| "loss": 0.0374, |
| "num_input_tokens_seen": 343549696, |
| "step": 335500 |
| }, |
| { |
| "epoch": 3.1001743848091454, |
| "grad_norm": 0.7736024260520935, |
| "learning_rate": 3.4499174209501665e-05, |
| "loss": 0.0398, |
| "num_input_tokens_seen": 344061696, |
| "step": 336000 |
| }, |
| { |
| "epoch": 3.1047877395484447, |
| "grad_norm": 1.619535207748413, |
| "learning_rate": 3.447610743580517e-05, |
| "loss": 0.0407, |
| "num_input_tokens_seen": 344573696, |
| "step": 336500 |
| }, |
| { |
| "epoch": 3.1094010942877444, |
| "grad_norm": 0.7229686975479126, |
| "learning_rate": 3.445304066210867e-05, |
| "loss": 0.035, |
| "num_input_tokens_seen": 345085696, |
| "step": 337000 |
| }, |
| { |
| "epoch": 3.1140144490270436, |
| "grad_norm": 0.757798433303833, |
| "learning_rate": 3.4429973888412174e-05, |
| "loss": 0.0356, |
| "num_input_tokens_seen": 345597696, |
| "step": 337500 |
| }, |
| { |
| "epoch": 3.118627803766343, |
| "grad_norm": 1.478723168373108, |
| "learning_rate": 3.440690711471568e-05, |
| "loss": 0.0375, |
| "num_input_tokens_seen": 346109696, |
| "step": 338000 |
| }, |
| { |
| "epoch": 3.123241158505642, |
| "grad_norm": 1.482269525527954, |
| "learning_rate": 3.438384034101919e-05, |
| "loss": 0.0382, |
| "num_input_tokens_seen": 346621696, |
| "step": 338500 |
| }, |
| { |
| "epoch": 3.1278545132449413, |
| "grad_norm": 1.0418490171432495, |
| "learning_rate": 3.436077356732268e-05, |
| "loss": 0.0364, |
| "num_input_tokens_seen": 347133696, |
| "step": 339000 |
| }, |
| { |
| "epoch": 3.1324678679842406, |
| "grad_norm": 0.8459765911102295, |
| "learning_rate": 3.433770679362619e-05, |
| "loss": 0.0355, |
| "num_input_tokens_seen": 347645696, |
| "step": 339500 |
| }, |
| { |
| "epoch": 3.1370812227235403, |
| "grad_norm": 0.91368168592453, |
| "learning_rate": 3.43146400199297e-05, |
| "loss": 0.0384, |
| "num_input_tokens_seen": 348157696, |
| "step": 340000 |
| }, |
| { |
| "epoch": 3.1416945774628395, |
| "grad_norm": 1.1992415189743042, |
| "learning_rate": 3.42915732462332e-05, |
| "loss": 0.0402, |
| "num_input_tokens_seen": 348669696, |
| "step": 340500 |
| }, |
| { |
| "epoch": 3.146307932202139, |
| "grad_norm": 1.1619198322296143, |
| "learning_rate": 3.42685064725367e-05, |
| "loss": 0.0401, |
| "num_input_tokens_seen": 349181696, |
| "step": 341000 |
| }, |
| { |
| "epoch": 3.150921286941438, |
| "grad_norm": 0.8243937492370605, |
| "learning_rate": 3.4245439698840206e-05, |
| "loss": 0.039, |
| "num_input_tokens_seen": 349693696, |
| "step": 341500 |
| }, |
| { |
| "epoch": 3.1555346416807373, |
| "grad_norm": 1.217475175857544, |
| "learning_rate": 3.4222372925143706e-05, |
| "loss": 0.0392, |
| "num_input_tokens_seen": 350205696, |
| "step": 342000 |
| }, |
| { |
| "epoch": 3.1601479964200365, |
| "grad_norm": 1.7150335311889648, |
| "learning_rate": 3.4199306151447214e-05, |
| "loss": 0.0352, |
| "num_input_tokens_seen": 350717696, |
| "step": 342500 |
| }, |
| { |
| "epoch": 3.164761351159336, |
| "grad_norm": 0.892362117767334, |
| "learning_rate": 3.4176239377750714e-05, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 351229696, |
| "step": 343000 |
| }, |
| { |
| "epoch": 3.1693747058986355, |
| "grad_norm": 0.5353464484214783, |
| "learning_rate": 3.4153172604054215e-05, |
| "loss": 0.0378, |
| "num_input_tokens_seen": 351741696, |
| "step": 343500 |
| }, |
| { |
| "epoch": 3.1739880606379347, |
| "grad_norm": 1.603272557258606, |
| "learning_rate": 3.413010583035772e-05, |
| "loss": 0.0401, |
| "num_input_tokens_seen": 352253696, |
| "step": 344000 |
| }, |
| { |
| "epoch": 3.178601415377234, |
| "grad_norm": 1.0198638439178467, |
| "learning_rate": 3.410703905666122e-05, |
| "loss": 0.0364, |
| "num_input_tokens_seen": 352765696, |
| "step": 344500 |
| }, |
| { |
| "epoch": 3.183214770116533, |
| "grad_norm": 0.7820620536804199, |
| "learning_rate": 3.408397228296473e-05, |
| "loss": 0.038, |
| "num_input_tokens_seen": 353277696, |
| "step": 345000 |
| }, |
| { |
| "epoch": 3.187828124855833, |
| "grad_norm": 1.567887306213379, |
| "learning_rate": 3.406090550926823e-05, |
| "loss": 0.0368, |
| "num_input_tokens_seen": 353789696, |
| "step": 345500 |
| }, |
| { |
| "epoch": 3.192441479595132, |
| "grad_norm": 1.5703437328338623, |
| "learning_rate": 3.403783873557174e-05, |
| "loss": 0.0385, |
| "num_input_tokens_seen": 354301696, |
| "step": 346000 |
| }, |
| { |
| "epoch": 3.1970548343344314, |
| "grad_norm": 0.5745303630828857, |
| "learning_rate": 3.401477196187524e-05, |
| "loss": 0.0368, |
| "num_input_tokens_seen": 354813696, |
| "step": 346500 |
| }, |
| { |
| "epoch": 3.2016681890737306, |
| "grad_norm": 0.9760965704917908, |
| "learning_rate": 3.399170518817874e-05, |
| "loss": 0.0414, |
| "num_input_tokens_seen": 355325696, |
| "step": 347000 |
| }, |
| { |
| "epoch": 3.20628154381303, |
| "grad_norm": 1.1067168712615967, |
| "learning_rate": 3.3968638414482246e-05, |
| "loss": 0.0379, |
| "num_input_tokens_seen": 355837696, |
| "step": 347500 |
| }, |
| { |
| "epoch": 3.210894898552329, |
| "grad_norm": 1.1161097288131714, |
| "learning_rate": 3.3945571640785754e-05, |
| "loss": 0.0384, |
| "num_input_tokens_seen": 356349696, |
| "step": 348000 |
| }, |
| { |
| "epoch": 3.2155082532916284, |
| "grad_norm": 2.1467411518096924, |
| "learning_rate": 3.392250486708925e-05, |
| "loss": 0.0387, |
| "num_input_tokens_seen": 356861696, |
| "step": 348500 |
| }, |
| { |
| "epoch": 3.220121608030928, |
| "grad_norm": 1.2950456142425537, |
| "learning_rate": 3.3899438093392755e-05, |
| "loss": 0.0362, |
| "num_input_tokens_seen": 357373696, |
| "step": 349000 |
| }, |
| { |
| "epoch": 3.2247349627702273, |
| "grad_norm": 1.0559481382369995, |
| "learning_rate": 3.387637131969626e-05, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 357885696, |
| "step": 349500 |
| }, |
| { |
| "epoch": 3.2293483175095266, |
| "grad_norm": 1.2557491064071655, |
| "learning_rate": 3.385330454599976e-05, |
| "loss": 0.0409, |
| "num_input_tokens_seen": 358397696, |
| "step": 350000 |
| }, |
| { |
| "epoch": 3.233961672248826, |
| "grad_norm": 0.9372035264968872, |
| "learning_rate": 3.3830237772303264e-05, |
| "loss": 0.0404, |
| "num_input_tokens_seen": 358909696, |
| "step": 350500 |
| }, |
| { |
| "epoch": 3.238575026988125, |
| "grad_norm": 0.6541593670845032, |
| "learning_rate": 3.380717099860677e-05, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 359421696, |
| "step": 351000 |
| }, |
| { |
| "epoch": 3.2431883817274247, |
| "grad_norm": 0.9174505472183228, |
| "learning_rate": 3.378410422491027e-05, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 359933696, |
| "step": 351500 |
| }, |
| { |
| "epoch": 3.247801736466724, |
| "grad_norm": 0.9051727056503296, |
| "learning_rate": 3.376103745121377e-05, |
| "loss": 0.0375, |
| "num_input_tokens_seen": 360445696, |
| "step": 352000 |
| }, |
| { |
| "epoch": 3.2524150912060232, |
| "grad_norm": 1.1875522136688232, |
| "learning_rate": 3.373797067751728e-05, |
| "loss": 0.0431, |
| "num_input_tokens_seen": 360957696, |
| "step": 352500 |
| }, |
| { |
| "epoch": 3.2570284459453225, |
| "grad_norm": 0.1862681657075882, |
| "learning_rate": 3.371490390382078e-05, |
| "loss": 0.0385, |
| "num_input_tokens_seen": 361469696, |
| "step": 353000 |
| }, |
| { |
| "epoch": 3.2616418006846217, |
| "grad_norm": 1.5912601947784424, |
| "learning_rate": 3.369183713012429e-05, |
| "loss": 0.0371, |
| "num_input_tokens_seen": 361981696, |
| "step": 353500 |
| }, |
| { |
| "epoch": 3.266255155423921, |
| "grad_norm": 1.4725751876831055, |
| "learning_rate": 3.366877035642779e-05, |
| "loss": 0.0417, |
| "num_input_tokens_seen": 362493696, |
| "step": 354000 |
| }, |
| { |
| "epoch": 3.2708685101632207, |
| "grad_norm": 0.7821846604347229, |
| "learning_rate": 3.364570358273129e-05, |
| "loss": 0.0371, |
| "num_input_tokens_seen": 363005696, |
| "step": 354500 |
| }, |
| { |
| "epoch": 3.27548186490252, |
| "grad_norm": 1.3403239250183105, |
| "learning_rate": 3.3622636809034796e-05, |
| "loss": 0.0437, |
| "num_input_tokens_seen": 363517696, |
| "step": 355000 |
| }, |
| { |
| "epoch": 3.280095219641819, |
| "grad_norm": 1.3142443895339966, |
| "learning_rate": 3.35995700353383e-05, |
| "loss": 0.0424, |
| "num_input_tokens_seen": 364029696, |
| "step": 355500 |
| }, |
| { |
| "epoch": 3.2847085743811184, |
| "grad_norm": 0.7003629207611084, |
| "learning_rate": 3.3576503261641804e-05, |
| "loss": 0.038, |
| "num_input_tokens_seen": 364541696, |
| "step": 356000 |
| }, |
| { |
| "epoch": 3.2893219291204177, |
| "grad_norm": 2.1016480922698975, |
| "learning_rate": 3.3553436487945304e-05, |
| "loss": 0.0389, |
| "num_input_tokens_seen": 365053696, |
| "step": 356500 |
| }, |
| { |
| "epoch": 3.2939352838597173, |
| "grad_norm": 0.9255128502845764, |
| "learning_rate": 3.353036971424881e-05, |
| "loss": 0.0414, |
| "num_input_tokens_seen": 365565696, |
| "step": 357000 |
| }, |
| { |
| "epoch": 3.2985486385990166, |
| "grad_norm": 2.0615665912628174, |
| "learning_rate": 3.350730294055231e-05, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 366077696, |
| "step": 357500 |
| }, |
| { |
| "epoch": 3.303161993338316, |
| "grad_norm": 0.5057035088539124, |
| "learning_rate": 3.348423616685581e-05, |
| "loss": 0.0441, |
| "num_input_tokens_seen": 366589696, |
| "step": 358000 |
| }, |
| { |
| "epoch": 3.307775348077615, |
| "grad_norm": 2.8129680156707764, |
| "learning_rate": 3.346116939315932e-05, |
| "loss": 0.0368, |
| "num_input_tokens_seen": 367101696, |
| "step": 358500 |
| }, |
| { |
| "epoch": 3.3123887028169143, |
| "grad_norm": 2.223184823989868, |
| "learning_rate": 3.343810261946283e-05, |
| "loss": 0.0423, |
| "num_input_tokens_seen": 367613696, |
| "step": 359000 |
| }, |
| { |
| "epoch": 3.3170020575562136, |
| "grad_norm": 1.127394199371338, |
| "learning_rate": 3.341503584576632e-05, |
| "loss": 0.0397, |
| "num_input_tokens_seen": 368125696, |
| "step": 359500 |
| }, |
| { |
| "epoch": 3.321615412295513, |
| "grad_norm": 2.887812376022339, |
| "learning_rate": 3.339196907206983e-05, |
| "loss": 0.0379, |
| "num_input_tokens_seen": 368637696, |
| "step": 360000 |
| }, |
| { |
| "epoch": 3.3262287670348125, |
| "grad_norm": 1.08502197265625, |
| "learning_rate": 3.3368902298373336e-05, |
| "loss": 0.0421, |
| "num_input_tokens_seen": 369149696, |
| "step": 360500 |
| }, |
| { |
| "epoch": 3.3308421217741118, |
| "grad_norm": 1.0474424362182617, |
| "learning_rate": 3.334583552467684e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 369661696, |
| "step": 361000 |
| }, |
| { |
| "epoch": 3.335455476513411, |
| "grad_norm": 0.7261756658554077, |
| "learning_rate": 3.332276875098034e-05, |
| "loss": 0.0409, |
| "num_input_tokens_seen": 370173696, |
| "step": 361500 |
| }, |
| { |
| "epoch": 3.3400688312527103, |
| "grad_norm": 0.6790010929107666, |
| "learning_rate": 3.3299701977283845e-05, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 370685696, |
| "step": 362000 |
| }, |
| { |
| "epoch": 3.3446821859920095, |
| "grad_norm": 1.7215800285339355, |
| "learning_rate": 3.3276635203587345e-05, |
| "loss": 0.0411, |
| "num_input_tokens_seen": 371197696, |
| "step": 362500 |
| }, |
| { |
| "epoch": 3.349295540731309, |
| "grad_norm": 1.112464189529419, |
| "learning_rate": 3.325356842989085e-05, |
| "loss": 0.0421, |
| "num_input_tokens_seen": 371709696, |
| "step": 363000 |
| }, |
| { |
| "epoch": 3.3539088954706084, |
| "grad_norm": 1.0138994455337524, |
| "learning_rate": 3.323050165619435e-05, |
| "loss": 0.0369, |
| "num_input_tokens_seen": 372221696, |
| "step": 363500 |
| }, |
| { |
| "epoch": 3.3585222502099077, |
| "grad_norm": 0.584247887134552, |
| "learning_rate": 3.3207434882497854e-05, |
| "loss": 0.0402, |
| "num_input_tokens_seen": 372733696, |
| "step": 364000 |
| }, |
| { |
| "epoch": 3.363135604949207, |
| "grad_norm": 1.9375905990600586, |
| "learning_rate": 3.318436810880136e-05, |
| "loss": 0.0359, |
| "num_input_tokens_seen": 373245696, |
| "step": 364500 |
| }, |
| { |
| "epoch": 3.367748959688506, |
| "grad_norm": 1.225064992904663, |
| "learning_rate": 3.316130133510486e-05, |
| "loss": 0.0378, |
| "num_input_tokens_seen": 373757696, |
| "step": 365000 |
| }, |
| { |
| "epoch": 3.3723623144278054, |
| "grad_norm": 1.0532304048538208, |
| "learning_rate": 3.313823456140836e-05, |
| "loss": 0.0422, |
| "num_input_tokens_seen": 374269696, |
| "step": 365500 |
| }, |
| { |
| "epoch": 3.376975669167105, |
| "grad_norm": 0.950737714767456, |
| "learning_rate": 3.311516778771187e-05, |
| "loss": 0.0385, |
| "num_input_tokens_seen": 374781696, |
| "step": 366000 |
| }, |
| { |
| "epoch": 3.3815890239064044, |
| "grad_norm": 0.340679794549942, |
| "learning_rate": 3.309210101401538e-05, |
| "loss": 0.0364, |
| "num_input_tokens_seen": 375293696, |
| "step": 366500 |
| }, |
| { |
| "epoch": 3.3862023786457036, |
| "grad_norm": 4.747739791870117, |
| "learning_rate": 3.306903424031888e-05, |
| "loss": 0.0354, |
| "num_input_tokens_seen": 375805696, |
| "step": 367000 |
| }, |
| { |
| "epoch": 3.390815733385003, |
| "grad_norm": 1.7227208614349365, |
| "learning_rate": 3.304596746662238e-05, |
| "loss": 0.0413, |
| "num_input_tokens_seen": 376317696, |
| "step": 367500 |
| }, |
| { |
| "epoch": 3.395429088124302, |
| "grad_norm": 1.4410547018051147, |
| "learning_rate": 3.3022900692925886e-05, |
| "loss": 0.0359, |
| "num_input_tokens_seen": 376829696, |
| "step": 368000 |
| }, |
| { |
| "epoch": 3.400042442863602, |
| "grad_norm": 0.847284197807312, |
| "learning_rate": 3.2999833919229386e-05, |
| "loss": 0.0437, |
| "num_input_tokens_seen": 377341696, |
| "step": 368500 |
| }, |
| { |
| "epoch": 3.404655797602901, |
| "grad_norm": 1.7439848184585571, |
| "learning_rate": 3.297676714553289e-05, |
| "loss": 0.0362, |
| "num_input_tokens_seen": 377853696, |
| "step": 369000 |
| }, |
| { |
| "epoch": 3.4092691523422003, |
| "grad_norm": 0.6023704409599304, |
| "learning_rate": 3.2953700371836394e-05, |
| "loss": 0.0418, |
| "num_input_tokens_seen": 378365696, |
| "step": 369500 |
| }, |
| { |
| "epoch": 3.4138825070814995, |
| "grad_norm": 0.3590753972530365, |
| "learning_rate": 3.29306335981399e-05, |
| "loss": 0.0402, |
| "num_input_tokens_seen": 378877696, |
| "step": 370000 |
| }, |
| { |
| "epoch": 3.418495861820799, |
| "grad_norm": 1.0211530923843384, |
| "learning_rate": 3.2907566824443395e-05, |
| "loss": 0.0374, |
| "num_input_tokens_seen": 379389696, |
| "step": 370500 |
| }, |
| { |
| "epoch": 3.423109216560098, |
| "grad_norm": 0.9513002038002014, |
| "learning_rate": 3.28845000507469e-05, |
| "loss": 0.0401, |
| "num_input_tokens_seen": 379901696, |
| "step": 371000 |
| }, |
| { |
| "epoch": 3.4277225712993973, |
| "grad_norm": 1.0161465406417847, |
| "learning_rate": 3.286143327705041e-05, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 380413696, |
| "step": 371500 |
| }, |
| { |
| "epoch": 3.432335926038697, |
| "grad_norm": 1.2249014377593994, |
| "learning_rate": 3.283836650335391e-05, |
| "loss": 0.0401, |
| "num_input_tokens_seen": 380925696, |
| "step": 372000 |
| }, |
| { |
| "epoch": 3.436949280777996, |
| "grad_norm": 1.3249224424362183, |
| "learning_rate": 3.281529972965741e-05, |
| "loss": 0.0414, |
| "num_input_tokens_seen": 381437696, |
| "step": 372500 |
| }, |
| { |
| "epoch": 3.4415626355172955, |
| "grad_norm": 3.6392204761505127, |
| "learning_rate": 3.279223295596092e-05, |
| "loss": 0.0367, |
| "num_input_tokens_seen": 381949696, |
| "step": 373000 |
| }, |
| { |
| "epoch": 3.4461759902565947, |
| "grad_norm": 0.9922639727592468, |
| "learning_rate": 3.276916618226442e-05, |
| "loss": 0.0418, |
| "num_input_tokens_seen": 382461696, |
| "step": 373500 |
| }, |
| { |
| "epoch": 3.450789344995894, |
| "grad_norm": 2.1645193099975586, |
| "learning_rate": 3.2746099408567926e-05, |
| "loss": 0.0382, |
| "num_input_tokens_seen": 382973696, |
| "step": 374000 |
| }, |
| { |
| "epoch": 3.4554026997351937, |
| "grad_norm": 2.5222291946411133, |
| "learning_rate": 3.272303263487143e-05, |
| "loss": 0.0399, |
| "num_input_tokens_seen": 383485696, |
| "step": 374500 |
| }, |
| { |
| "epoch": 3.460016054474493, |
| "grad_norm": 2.2609009742736816, |
| "learning_rate": 3.269996586117493e-05, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 383997696, |
| "step": 375000 |
| }, |
| { |
| "epoch": 3.464629409213792, |
| "grad_norm": 3.2856132984161377, |
| "learning_rate": 3.2676899087478435e-05, |
| "loss": 0.0391, |
| "num_input_tokens_seen": 384509696, |
| "step": 375500 |
| }, |
| { |
| "epoch": 3.4692427639530914, |
| "grad_norm": 0.6138939261436462, |
| "learning_rate": 3.265383231378194e-05, |
| "loss": 0.0398, |
| "num_input_tokens_seen": 385021696, |
| "step": 376000 |
| }, |
| { |
| "epoch": 3.4738561186923906, |
| "grad_norm": 1.3824810981750488, |
| "learning_rate": 3.263076554008544e-05, |
| "loss": 0.0374, |
| "num_input_tokens_seen": 385533696, |
| "step": 376500 |
| }, |
| { |
| "epoch": 3.47846947343169, |
| "grad_norm": 1.539600133895874, |
| "learning_rate": 3.2607698766388943e-05, |
| "loss": 0.0397, |
| "num_input_tokens_seen": 386045696, |
| "step": 377000 |
| }, |
| { |
| "epoch": 3.483082828170989, |
| "grad_norm": 0.7915021181106567, |
| "learning_rate": 3.258463199269245e-05, |
| "loss": 0.0408, |
| "num_input_tokens_seen": 386557696, |
| "step": 377500 |
| }, |
| { |
| "epoch": 3.487696182910289, |
| "grad_norm": 1.5975933074951172, |
| "learning_rate": 3.256156521899595e-05, |
| "loss": 0.0382, |
| "num_input_tokens_seen": 387069696, |
| "step": 378000 |
| }, |
| { |
| "epoch": 3.492309537649588, |
| "grad_norm": 1.8749665021896362, |
| "learning_rate": 3.253849844529945e-05, |
| "loss": 0.0407, |
| "num_input_tokens_seen": 387581696, |
| "step": 378500 |
| }, |
| { |
| "epoch": 3.4969228923888873, |
| "grad_norm": 1.7674627304077148, |
| "learning_rate": 3.251543167160296e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 388093696, |
| "step": 379000 |
| }, |
| { |
| "epoch": 3.5015362471281866, |
| "grad_norm": 0.8147306442260742, |
| "learning_rate": 3.249236489790646e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 388605696, |
| "step": 379500 |
| }, |
| { |
| "epoch": 3.5061496018674863, |
| "grad_norm": 0.7411497235298157, |
| "learning_rate": 3.246929812420996e-05, |
| "loss": 0.0394, |
| "num_input_tokens_seen": 389117696, |
| "step": 380000 |
| }, |
| { |
| "epoch": 3.5107629566067855, |
| "grad_norm": 1.145559549331665, |
| "learning_rate": 3.244623135051347e-05, |
| "loss": 0.0432, |
| "num_input_tokens_seen": 389629696, |
| "step": 380500 |
| }, |
| { |
| "epoch": 3.5153763113460847, |
| "grad_norm": 1.1018445491790771, |
| "learning_rate": 3.2423164576816975e-05, |
| "loss": 0.0426, |
| "num_input_tokens_seen": 390141696, |
| "step": 381000 |
| }, |
| { |
| "epoch": 3.519989666085384, |
| "grad_norm": 5.711886882781982, |
| "learning_rate": 3.2400097803120476e-05, |
| "loss": 0.0362, |
| "num_input_tokens_seen": 390653696, |
| "step": 381500 |
| }, |
| { |
| "epoch": 3.5246030208246832, |
| "grad_norm": 5.521966934204102, |
| "learning_rate": 3.2377031029423976e-05, |
| "loss": 0.0445, |
| "num_input_tokens_seen": 391165696, |
| "step": 382000 |
| }, |
| { |
| "epoch": 3.5292163755639825, |
| "grad_norm": 1.7097331285476685, |
| "learning_rate": 3.2353964255727484e-05, |
| "loss": 0.0394, |
| "num_input_tokens_seen": 391677696, |
| "step": 382500 |
| }, |
| { |
| "epoch": 3.5338297303032817, |
| "grad_norm": 2.794013023376465, |
| "learning_rate": 3.2330897482030984e-05, |
| "loss": 0.0418, |
| "num_input_tokens_seen": 392189696, |
| "step": 383000 |
| }, |
| { |
| "epoch": 3.5384430850425814, |
| "grad_norm": 0.8009048700332642, |
| "learning_rate": 3.2307830708334485e-05, |
| "loss": 0.0402, |
| "num_input_tokens_seen": 392701696, |
| "step": 383500 |
| }, |
| { |
| "epoch": 3.5430564397818807, |
| "grad_norm": 1.5974643230438232, |
| "learning_rate": 3.228476393463799e-05, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 393213696, |
| "step": 384000 |
| }, |
| { |
| "epoch": 3.54766979452118, |
| "grad_norm": 2.538250207901001, |
| "learning_rate": 3.226169716094149e-05, |
| "loss": 0.0401, |
| "num_input_tokens_seen": 393725696, |
| "step": 384500 |
| }, |
| { |
| "epoch": 3.552283149260479, |
| "grad_norm": 1.2976337671279907, |
| "learning_rate": 3.2238630387245e-05, |
| "loss": 0.0379, |
| "num_input_tokens_seen": 394237696, |
| "step": 385000 |
| }, |
| { |
| "epoch": 3.5568965039997784, |
| "grad_norm": 1.1865109205245972, |
| "learning_rate": 3.22155636135485e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 394749696, |
| "step": 385500 |
| }, |
| { |
| "epoch": 3.561509858739078, |
| "grad_norm": 0.36470434069633484, |
| "learning_rate": 3.2192496839852e-05, |
| "loss": 0.0399, |
| "num_input_tokens_seen": 395261696, |
| "step": 386000 |
| }, |
| { |
| "epoch": 3.5661232134783774, |
| "grad_norm": 2.1635212898254395, |
| "learning_rate": 3.216943006615551e-05, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 395773696, |
| "step": 386500 |
| }, |
| { |
| "epoch": 3.5707365682176766, |
| "grad_norm": 1.7805256843566895, |
| "learning_rate": 3.2146363292459016e-05, |
| "loss": 0.0391, |
| "num_input_tokens_seen": 396285696, |
| "step": 387000 |
| }, |
| { |
| "epoch": 3.575349922956976, |
| "grad_norm": 1.5320919752120972, |
| "learning_rate": 3.212329651876252e-05, |
| "loss": 0.0417, |
| "num_input_tokens_seen": 396797696, |
| "step": 387500 |
| }, |
| { |
| "epoch": 3.579963277696275, |
| "grad_norm": 3.523890733718872, |
| "learning_rate": 3.210022974506602e-05, |
| "loss": 0.0394, |
| "num_input_tokens_seen": 397309696, |
| "step": 388000 |
| }, |
| { |
| "epoch": 3.5845766324355743, |
| "grad_norm": 1.2910226583480835, |
| "learning_rate": 3.2077162971369525e-05, |
| "loss": 0.0397, |
| "num_input_tokens_seen": 397821696, |
| "step": 388500 |
| }, |
| { |
| "epoch": 3.5891899871748736, |
| "grad_norm": 1.5501660108566284, |
| "learning_rate": 3.2054096197673025e-05, |
| "loss": 0.0396, |
| "num_input_tokens_seen": 398333696, |
| "step": 389000 |
| }, |
| { |
| "epoch": 3.5938033419141733, |
| "grad_norm": 1.1182091236114502, |
| "learning_rate": 3.2031029423976526e-05, |
| "loss": 0.0421, |
| "num_input_tokens_seen": 398845696, |
| "step": 389500 |
| }, |
| { |
| "epoch": 3.5984166966534725, |
| "grad_norm": 1.5010899305343628, |
| "learning_rate": 3.200796265028003e-05, |
| "loss": 0.038, |
| "num_input_tokens_seen": 399357696, |
| "step": 390000 |
| }, |
| { |
| "epoch": 3.6030300513927718, |
| "grad_norm": 0.4965997040271759, |
| "learning_rate": 3.198489587658354e-05, |
| "loss": 0.0392, |
| "num_input_tokens_seen": 399869696, |
| "step": 390500 |
| }, |
| { |
| "epoch": 3.607643406132071, |
| "grad_norm": 0.735758364200592, |
| "learning_rate": 3.1961829102887034e-05, |
| "loss": 0.0375, |
| "num_input_tokens_seen": 400381696, |
| "step": 391000 |
| }, |
| { |
| "epoch": 3.6122567608713707, |
| "grad_norm": 0.9119324684143066, |
| "learning_rate": 3.193876232919054e-05, |
| "loss": 0.0397, |
| "num_input_tokens_seen": 400893696, |
| "step": 391500 |
| }, |
| { |
| "epoch": 3.61687011561067, |
| "grad_norm": 1.0355151891708374, |
| "learning_rate": 3.191569555549405e-05, |
| "loss": 0.0379, |
| "num_input_tokens_seen": 401405696, |
| "step": 392000 |
| }, |
| { |
| "epoch": 3.621483470349969, |
| "grad_norm": 1.574038028717041, |
| "learning_rate": 3.189262878179755e-05, |
| "loss": 0.0398, |
| "num_input_tokens_seen": 401917696, |
| "step": 392500 |
| }, |
| { |
| "epoch": 3.6260968250892684, |
| "grad_norm": 1.9339407682418823, |
| "learning_rate": 3.186956200810105e-05, |
| "loss": 0.0366, |
| "num_input_tokens_seen": 402429696, |
| "step": 393000 |
| }, |
| { |
| "epoch": 3.6307101798285677, |
| "grad_norm": 1.808971643447876, |
| "learning_rate": 3.184649523440456e-05, |
| "loss": 0.0433, |
| "num_input_tokens_seen": 402941696, |
| "step": 393500 |
| }, |
| { |
| "epoch": 3.635323534567867, |
| "grad_norm": 0.8877146244049072, |
| "learning_rate": 3.182342846070806e-05, |
| "loss": 0.0402, |
| "num_input_tokens_seen": 403453696, |
| "step": 394000 |
| }, |
| { |
| "epoch": 3.639936889307166, |
| "grad_norm": 1.4622044563293457, |
| "learning_rate": 3.1800361687011566e-05, |
| "loss": 0.0429, |
| "num_input_tokens_seen": 403965696, |
| "step": 394500 |
| }, |
| { |
| "epoch": 3.6445502440464654, |
| "grad_norm": 1.1509592533111572, |
| "learning_rate": 3.1777294913315066e-05, |
| "loss": 0.0378, |
| "num_input_tokens_seen": 404477696, |
| "step": 395000 |
| }, |
| { |
| "epoch": 3.649163598785765, |
| "grad_norm": 1.6934188604354858, |
| "learning_rate": 3.175422813961857e-05, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 404989696, |
| "step": 395500 |
| }, |
| { |
| "epoch": 3.6537769535250644, |
| "grad_norm": 2.861666202545166, |
| "learning_rate": 3.1731161365922074e-05, |
| "loss": 0.0382, |
| "num_input_tokens_seen": 405501696, |
| "step": 396000 |
| }, |
| { |
| "epoch": 3.6583903082643636, |
| "grad_norm": 1.3087468147277832, |
| "learning_rate": 3.1708094592225575e-05, |
| "loss": 0.0387, |
| "num_input_tokens_seen": 406013696, |
| "step": 396500 |
| }, |
| { |
| "epoch": 3.663003663003663, |
| "grad_norm": 0.8184057474136353, |
| "learning_rate": 3.1685027818529075e-05, |
| "loss": 0.0436, |
| "num_input_tokens_seen": 406525696, |
| "step": 397000 |
| }, |
| { |
| "epoch": 3.6676170177429626, |
| "grad_norm": 1.3447506427764893, |
| "learning_rate": 3.166196104483258e-05, |
| "loss": 0.0387, |
| "num_input_tokens_seen": 407037696, |
| "step": 397500 |
| }, |
| { |
| "epoch": 3.672230372482262, |
| "grad_norm": 1.8640304803848267, |
| "learning_rate": 3.163889427113609e-05, |
| "loss": 0.0427, |
| "num_input_tokens_seen": 407549696, |
| "step": 398000 |
| }, |
| { |
| "epoch": 3.676843727221561, |
| "grad_norm": 6.683871746063232, |
| "learning_rate": 3.161582749743959e-05, |
| "loss": 0.0413, |
| "num_input_tokens_seen": 408061696, |
| "step": 398500 |
| }, |
| { |
| "epoch": 3.6814570819608603, |
| "grad_norm": 0.6029996275901794, |
| "learning_rate": 3.159276072374309e-05, |
| "loss": 0.0428, |
| "num_input_tokens_seen": 408573696, |
| "step": 399000 |
| }, |
| { |
| "epoch": 3.6860704367001595, |
| "grad_norm": 0.6650155782699585, |
| "learning_rate": 3.15696939500466e-05, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 409085696, |
| "step": 399500 |
| }, |
| { |
| "epoch": 3.690683791439459, |
| "grad_norm": 0.6915871500968933, |
| "learning_rate": 3.15466271763501e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 409597696, |
| "step": 400000 |
| }, |
| { |
| "epoch": 3.695297146178758, |
| "grad_norm": 0.9651739597320557, |
| "learning_rate": 3.15235604026536e-05, |
| "loss": 0.0388, |
| "num_input_tokens_seen": 410109696, |
| "step": 400500 |
| }, |
| { |
| "epoch": 3.6999105009180577, |
| "grad_norm": 1.2852321863174438, |
| "learning_rate": 3.150049362895711e-05, |
| "loss": 0.0436, |
| "num_input_tokens_seen": 410621696, |
| "step": 401000 |
| }, |
| { |
| "epoch": 3.704523855657357, |
| "grad_norm": 1.250339150428772, |
| "learning_rate": 3.1477426855260614e-05, |
| "loss": 0.0371, |
| "num_input_tokens_seen": 411133696, |
| "step": 401500 |
| }, |
| { |
| "epoch": 3.7091372103966562, |
| "grad_norm": 0.9992502927780151, |
| "learning_rate": 3.1454360081564115e-05, |
| "loss": 0.0413, |
| "num_input_tokens_seen": 411645696, |
| "step": 402000 |
| }, |
| { |
| "epoch": 3.7137505651359555, |
| "grad_norm": 3.6451685428619385, |
| "learning_rate": 3.1431293307867615e-05, |
| "loss": 0.0425, |
| "num_input_tokens_seen": 412157696, |
| "step": 402500 |
| }, |
| { |
| "epoch": 3.718363919875255, |
| "grad_norm": 0.49393585324287415, |
| "learning_rate": 3.140822653417112e-05, |
| "loss": 0.0414, |
| "num_input_tokens_seen": 412669696, |
| "step": 403000 |
| }, |
| { |
| "epoch": 3.7229772746145544, |
| "grad_norm": 1.5764920711517334, |
| "learning_rate": 3.1385159760474623e-05, |
| "loss": 0.0373, |
| "num_input_tokens_seen": 413181696, |
| "step": 403500 |
| }, |
| { |
| "epoch": 3.7275906293538537, |
| "grad_norm": 2.7465178966522217, |
| "learning_rate": 3.1362092986778124e-05, |
| "loss": 0.0418, |
| "num_input_tokens_seen": 413693696, |
| "step": 404000 |
| }, |
| { |
| "epoch": 3.732203984093153, |
| "grad_norm": 2.4784648418426514, |
| "learning_rate": 3.133902621308163e-05, |
| "loss": 0.0373, |
| "num_input_tokens_seen": 414205696, |
| "step": 404500 |
| }, |
| { |
| "epoch": 3.736817338832452, |
| "grad_norm": 1.1435418128967285, |
| "learning_rate": 3.131595943938513e-05, |
| "loss": 0.0393, |
| "num_input_tokens_seen": 414717696, |
| "step": 405000 |
| }, |
| { |
| "epoch": 3.7414306935717514, |
| "grad_norm": 3.1641488075256348, |
| "learning_rate": 3.129289266568864e-05, |
| "loss": 0.0378, |
| "num_input_tokens_seen": 415229696, |
| "step": 405500 |
| }, |
| { |
| "epoch": 3.7460440483110506, |
| "grad_norm": 1.299619436264038, |
| "learning_rate": 3.126982589199214e-05, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 415741696, |
| "step": 406000 |
| }, |
| { |
| "epoch": 3.75065740305035, |
| "grad_norm": 1.7014168500900269, |
| "learning_rate": 3.124675911829564e-05, |
| "loss": 0.0448, |
| "num_input_tokens_seen": 416253696, |
| "step": 406500 |
| }, |
| { |
| "epoch": 3.7552707577896496, |
| "grad_norm": 1.5592892169952393, |
| "learning_rate": 3.122369234459915e-05, |
| "loss": 0.038, |
| "num_input_tokens_seen": 416765696, |
| "step": 407000 |
| }, |
| { |
| "epoch": 3.759884112528949, |
| "grad_norm": 0.6049352884292603, |
| "learning_rate": 3.1200625570902655e-05, |
| "loss": 0.039, |
| "num_input_tokens_seen": 417277696, |
| "step": 407500 |
| }, |
| { |
| "epoch": 3.764497467268248, |
| "grad_norm": 0.6392286419868469, |
| "learning_rate": 3.117755879720615e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 417789696, |
| "step": 408000 |
| }, |
| { |
| "epoch": 3.7691108220075473, |
| "grad_norm": 3.689347505569458, |
| "learning_rate": 3.1154492023509656e-05, |
| "loss": 0.0385, |
| "num_input_tokens_seen": 418301696, |
| "step": 408500 |
| }, |
| { |
| "epoch": 3.773724176746847, |
| "grad_norm": 0.8414890766143799, |
| "learning_rate": 3.1131425249813164e-05, |
| "loss": 0.0366, |
| "num_input_tokens_seen": 418813696, |
| "step": 409000 |
| }, |
| { |
| "epoch": 3.7783375314861463, |
| "grad_norm": 5.263124465942383, |
| "learning_rate": 3.1108358476116664e-05, |
| "loss": 0.0406, |
| "num_input_tokens_seen": 419325696, |
| "step": 409500 |
| }, |
| { |
| "epoch": 3.7829508862254455, |
| "grad_norm": 1.395107626914978, |
| "learning_rate": 3.1085291702420165e-05, |
| "loss": 0.0375, |
| "num_input_tokens_seen": 419837696, |
| "step": 410000 |
| }, |
| { |
| "epoch": 3.7875642409647448, |
| "grad_norm": 1.189859390258789, |
| "learning_rate": 3.106222492872367e-05, |
| "loss": 0.0373, |
| "num_input_tokens_seen": 420349696, |
| "step": 410500 |
| }, |
| { |
| "epoch": 3.792177595704044, |
| "grad_norm": 0.5523993372917175, |
| "learning_rate": 3.103915815502717e-05, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 420861696, |
| "step": 411000 |
| }, |
| { |
| "epoch": 3.7967909504433432, |
| "grad_norm": 0.6239033341407776, |
| "learning_rate": 3.1016091381330673e-05, |
| "loss": 0.0369, |
| "num_input_tokens_seen": 421373696, |
| "step": 411500 |
| }, |
| { |
| "epoch": 3.8014043051826425, |
| "grad_norm": 2.072326421737671, |
| "learning_rate": 3.099302460763418e-05, |
| "loss": 0.0435, |
| "num_input_tokens_seen": 421885696, |
| "step": 412000 |
| }, |
| { |
| "epoch": 3.806017659921942, |
| "grad_norm": 2.074704647064209, |
| "learning_rate": 3.096995783393769e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 422397696, |
| "step": 412500 |
| }, |
| { |
| "epoch": 3.8106310146612414, |
| "grad_norm": 1.9311884641647339, |
| "learning_rate": 3.094689106024119e-05, |
| "loss": 0.0428, |
| "num_input_tokens_seen": 422909696, |
| "step": 413000 |
| }, |
| { |
| "epoch": 3.8152443694005407, |
| "grad_norm": 1.3210355043411255, |
| "learning_rate": 3.092382428654469e-05, |
| "loss": 0.0429, |
| "num_input_tokens_seen": 423421696, |
| "step": 413500 |
| }, |
| { |
| "epoch": 3.81985772413984, |
| "grad_norm": 3.048222064971924, |
| "learning_rate": 3.09007575128482e-05, |
| "loss": 0.0418, |
| "num_input_tokens_seen": 423933696, |
| "step": 414000 |
| }, |
| { |
| "epoch": 3.824471078879139, |
| "grad_norm": 0.8300300240516663, |
| "learning_rate": 3.08776907391517e-05, |
| "loss": 0.0408, |
| "num_input_tokens_seen": 424445696, |
| "step": 414500 |
| }, |
| { |
| "epoch": 3.829084433618439, |
| "grad_norm": 0.6099697947502136, |
| "learning_rate": 3.0854623965455205e-05, |
| "loss": 0.0453, |
| "num_input_tokens_seen": 424957696, |
| "step": 415000 |
| }, |
| { |
| "epoch": 3.833697788357738, |
| "grad_norm": 1.205819845199585, |
| "learning_rate": 3.0831557191758705e-05, |
| "loss": 0.0379, |
| "num_input_tokens_seen": 425469696, |
| "step": 415500 |
| }, |
| { |
| "epoch": 3.8383111430970374, |
| "grad_norm": 2.9948160648345947, |
| "learning_rate": 3.0808490418062206e-05, |
| "loss": 0.0406, |
| "num_input_tokens_seen": 425981696, |
| "step": 416000 |
| }, |
| { |
| "epoch": 3.8429244978363366, |
| "grad_norm": 1.0202473402023315, |
| "learning_rate": 3.078542364436571e-05, |
| "loss": 0.0446, |
| "num_input_tokens_seen": 426493696, |
| "step": 416500 |
| }, |
| { |
| "epoch": 3.847537852575636, |
| "grad_norm": 1.2540485858917236, |
| "learning_rate": 3.0762356870669214e-05, |
| "loss": 0.0431, |
| "num_input_tokens_seen": 427005696, |
| "step": 417000 |
| }, |
| { |
| "epoch": 3.852151207314935, |
| "grad_norm": 1.10784113407135, |
| "learning_rate": 3.0739290096972714e-05, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 427517696, |
| "step": 417500 |
| }, |
| { |
| "epoch": 3.8567645620542343, |
| "grad_norm": 1.326798439025879, |
| "learning_rate": 3.071622332327622e-05, |
| "loss": 0.0392, |
| "num_input_tokens_seen": 428029696, |
| "step": 418000 |
| }, |
| { |
| "epoch": 3.861377916793534, |
| "grad_norm": 0.7203147411346436, |
| "learning_rate": 3.069315654957973e-05, |
| "loss": 0.0412, |
| "num_input_tokens_seen": 428541696, |
| "step": 418500 |
| }, |
| { |
| "epoch": 3.8659912715328333, |
| "grad_norm": 2.017019510269165, |
| "learning_rate": 3.067008977588323e-05, |
| "loss": 0.0397, |
| "num_input_tokens_seen": 429053696, |
| "step": 419000 |
| }, |
| { |
| "epoch": 3.8706046262721325, |
| "grad_norm": 1.9709299802780151, |
| "learning_rate": 3.064702300218673e-05, |
| "loss": 0.0382, |
| "num_input_tokens_seen": 429565696, |
| "step": 419500 |
| }, |
| { |
| "epoch": 3.875217981011432, |
| "grad_norm": 3.0947420597076416, |
| "learning_rate": 3.062395622849024e-05, |
| "loss": 0.037, |
| "num_input_tokens_seen": 430077696, |
| "step": 420000 |
| }, |
| { |
| "epoch": 3.8798313357507315, |
| "grad_norm": 1.6916519403457642, |
| "learning_rate": 3.060088945479374e-05, |
| "loss": 0.038, |
| "num_input_tokens_seen": 430589696, |
| "step": 420500 |
| }, |
| { |
| "epoch": 3.8844446904900307, |
| "grad_norm": 2.846257209777832, |
| "learning_rate": 3.057782268109724e-05, |
| "loss": 0.0415, |
| "num_input_tokens_seen": 431101696, |
| "step": 421000 |
| }, |
| { |
| "epoch": 3.88905804522933, |
| "grad_norm": 0.8271204233169556, |
| "learning_rate": 3.0554755907400746e-05, |
| "loss": 0.0428, |
| "num_input_tokens_seen": 431613696, |
| "step": 421500 |
| }, |
| { |
| "epoch": 3.893671399968629, |
| "grad_norm": 1.4244275093078613, |
| "learning_rate": 3.0531689133704247e-05, |
| "loss": 0.042, |
| "num_input_tokens_seen": 432125696, |
| "step": 422000 |
| }, |
| { |
| "epoch": 3.8982847547079285, |
| "grad_norm": 1.629799485206604, |
| "learning_rate": 3.050862236000775e-05, |
| "loss": 0.038, |
| "num_input_tokens_seen": 432637696, |
| "step": 422500 |
| }, |
| { |
| "epoch": 3.9028981094472277, |
| "grad_norm": 1.1674317121505737, |
| "learning_rate": 3.0485555586311255e-05, |
| "loss": 0.0408, |
| "num_input_tokens_seen": 433149696, |
| "step": 423000 |
| }, |
| { |
| "epoch": 3.907511464186527, |
| "grad_norm": 0.816435694694519, |
| "learning_rate": 3.046248881261476e-05, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 433661696, |
| "step": 423500 |
| }, |
| { |
| "epoch": 3.9121248189258266, |
| "grad_norm": 0.8461304903030396, |
| "learning_rate": 3.0439422038918262e-05, |
| "loss": 0.0414, |
| "num_input_tokens_seen": 434173696, |
| "step": 424000 |
| }, |
| { |
| "epoch": 3.916738173665126, |
| "grad_norm": 1.0469881296157837, |
| "learning_rate": 3.0416355265221763e-05, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 434685696, |
| "step": 424500 |
| }, |
| { |
| "epoch": 3.921351528404425, |
| "grad_norm": 2.0151569843292236, |
| "learning_rate": 3.0393288491525267e-05, |
| "loss": 0.0411, |
| "num_input_tokens_seen": 435197696, |
| "step": 425000 |
| }, |
| { |
| "epoch": 3.9259648831437244, |
| "grad_norm": 1.178753137588501, |
| "learning_rate": 3.0370221717828774e-05, |
| "loss": 0.0415, |
| "num_input_tokens_seen": 435709696, |
| "step": 425500 |
| }, |
| { |
| "epoch": 3.9305782378830236, |
| "grad_norm": 0.6420595049858093, |
| "learning_rate": 3.034715494413228e-05, |
| "loss": 0.0433, |
| "num_input_tokens_seen": 436221696, |
| "step": 426000 |
| }, |
| { |
| "epoch": 3.9351915926223233, |
| "grad_norm": 1.1695127487182617, |
| "learning_rate": 3.0324088170435776e-05, |
| "loss": 0.0415, |
| "num_input_tokens_seen": 436733696, |
| "step": 426500 |
| }, |
| { |
| "epoch": 3.9398049473616226, |
| "grad_norm": 0.9923868179321289, |
| "learning_rate": 3.0301021396739283e-05, |
| "loss": 0.0412, |
| "num_input_tokens_seen": 437245696, |
| "step": 427000 |
| }, |
| { |
| "epoch": 3.944418302100922, |
| "grad_norm": 0.8079075217247009, |
| "learning_rate": 3.0277954623042787e-05, |
| "loss": 0.0401, |
| "num_input_tokens_seen": 437757696, |
| "step": 427500 |
| }, |
| { |
| "epoch": 3.949031656840221, |
| "grad_norm": 2.699918746948242, |
| "learning_rate": 3.025488784934629e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 438269696, |
| "step": 428000 |
| }, |
| { |
| "epoch": 3.9536450115795203, |
| "grad_norm": 0.577458381652832, |
| "learning_rate": 3.023182107564979e-05, |
| "loss": 0.0404, |
| "num_input_tokens_seen": 438781696, |
| "step": 428500 |
| }, |
| { |
| "epoch": 3.9582583663188196, |
| "grad_norm": 0.6960185170173645, |
| "learning_rate": 3.0208754301953295e-05, |
| "loss": 0.0393, |
| "num_input_tokens_seen": 439293696, |
| "step": 429000 |
| }, |
| { |
| "epoch": 3.962871721058119, |
| "grad_norm": 1.2610116004943848, |
| "learning_rate": 3.01856875282568e-05, |
| "loss": 0.0385, |
| "num_input_tokens_seen": 439805696, |
| "step": 429500 |
| }, |
| { |
| "epoch": 3.9674850757974185, |
| "grad_norm": 1.0515618324279785, |
| "learning_rate": 3.01626207545603e-05, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 440317696, |
| "step": 430000 |
| }, |
| { |
| "epoch": 3.9720984305367177, |
| "grad_norm": 0.9695286154747009, |
| "learning_rate": 3.0139553980863804e-05, |
| "loss": 0.0425, |
| "num_input_tokens_seen": 440829696, |
| "step": 430500 |
| }, |
| { |
| "epoch": 3.976711785276017, |
| "grad_norm": 1.542039155960083, |
| "learning_rate": 3.011648720716731e-05, |
| "loss": 0.0392, |
| "num_input_tokens_seen": 441341696, |
| "step": 431000 |
| }, |
| { |
| "epoch": 3.9813251400153162, |
| "grad_norm": 1.2009466886520386, |
| "learning_rate": 3.0093420433470815e-05, |
| "loss": 0.043, |
| "num_input_tokens_seen": 441853696, |
| "step": 431500 |
| }, |
| { |
| "epoch": 3.985938494754616, |
| "grad_norm": 1.8694528341293335, |
| "learning_rate": 3.0070353659774312e-05, |
| "loss": 0.0396, |
| "num_input_tokens_seen": 442365696, |
| "step": 432000 |
| }, |
| { |
| "epoch": 3.990551849493915, |
| "grad_norm": 1.2931849956512451, |
| "learning_rate": 3.004728688607782e-05, |
| "loss": 0.0382, |
| "num_input_tokens_seen": 442877696, |
| "step": 432500 |
| }, |
| { |
| "epoch": 3.9951652042332144, |
| "grad_norm": 0.953074038028717, |
| "learning_rate": 3.0024220112381324e-05, |
| "loss": 0.0429, |
| "num_input_tokens_seen": 443389696, |
| "step": 433000 |
| }, |
| { |
| "epoch": 3.9997785589725137, |
| "grad_norm": 2.807677745819092, |
| "learning_rate": 3.0001153338684828e-05, |
| "loss": 0.0387, |
| "num_input_tokens_seen": 443901696, |
| "step": 433500 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_combined_score": 0.06748922723897993, |
| "eval_loss": 0.06748922914266586, |
| "eval_mse": 0.06748922533529399, |
| "eval_runtime": 49.5025, |
| "eval_samples_per_second": 1946.123, |
| "eval_steps_per_second": 243.281, |
| "num_input_tokens_seen": 443925504, |
| "step": 433524 |
| }, |
| { |
| "epoch": 4.004391913711813, |
| "grad_norm": 0.2404492050409317, |
| "learning_rate": 2.997808656498833e-05, |
| "loss": 0.0308, |
| "num_input_tokens_seen": 444412928, |
| "step": 434000 |
| }, |
| { |
| "epoch": 4.009005268451112, |
| "grad_norm": 1.2364345788955688, |
| "learning_rate": 2.9955019791291832e-05, |
| "loss": 0.0297, |
| "num_input_tokens_seen": 444924928, |
| "step": 434500 |
| }, |
| { |
| "epoch": 4.013618623190411, |
| "grad_norm": 0.9113791584968567, |
| "learning_rate": 2.9931953017595336e-05, |
| "loss": 0.0287, |
| "num_input_tokens_seen": 445436928, |
| "step": 435000 |
| }, |
| { |
| "epoch": 4.018231977929711, |
| "grad_norm": 1.880218267440796, |
| "learning_rate": 2.9908886243898837e-05, |
| "loss": 0.0294, |
| "num_input_tokens_seen": 445948928, |
| "step": 435500 |
| }, |
| { |
| "epoch": 4.02284533266901, |
| "grad_norm": 1.7842798233032227, |
| "learning_rate": 2.988581947020234e-05, |
| "loss": 0.0288, |
| "num_input_tokens_seen": 446460928, |
| "step": 436000 |
| }, |
| { |
| "epoch": 4.027458687408309, |
| "grad_norm": 0.5358702540397644, |
| "learning_rate": 2.9862752696505848e-05, |
| "loss": 0.0305, |
| "num_input_tokens_seen": 446972928, |
| "step": 436500 |
| }, |
| { |
| "epoch": 4.032072042147609, |
| "grad_norm": 0.7529350519180298, |
| "learning_rate": 2.9839685922809352e-05, |
| "loss": 0.029, |
| "num_input_tokens_seen": 447484928, |
| "step": 437000 |
| }, |
| { |
| "epoch": 4.0366853968869085, |
| "grad_norm": 0.6187124848365784, |
| "learning_rate": 2.981661914911285e-05, |
| "loss": 0.0303, |
| "num_input_tokens_seen": 447996928, |
| "step": 437500 |
| }, |
| { |
| "epoch": 4.041298751626208, |
| "grad_norm": 1.1267274618148804, |
| "learning_rate": 2.9793552375416357e-05, |
| "loss": 0.0292, |
| "num_input_tokens_seen": 448508928, |
| "step": 438000 |
| }, |
| { |
| "epoch": 4.045912106365507, |
| "grad_norm": 1.6049976348876953, |
| "learning_rate": 2.977048560171986e-05, |
| "loss": 0.0292, |
| "num_input_tokens_seen": 449020928, |
| "step": 438500 |
| }, |
| { |
| "epoch": 4.050525461104806, |
| "grad_norm": 3.9203622341156006, |
| "learning_rate": 2.9747418828023365e-05, |
| "loss": 0.0312, |
| "num_input_tokens_seen": 449532928, |
| "step": 439000 |
| }, |
| { |
| "epoch": 4.0551388158441055, |
| "grad_norm": 0.6487706899642944, |
| "learning_rate": 2.9724352054326865e-05, |
| "loss": 0.029, |
| "num_input_tokens_seen": 450044928, |
| "step": 439500 |
| }, |
| { |
| "epoch": 4.059752170583405, |
| "grad_norm": 0.9871296882629395, |
| "learning_rate": 2.970128528063037e-05, |
| "loss": 0.0299, |
| "num_input_tokens_seen": 450556928, |
| "step": 440000 |
| }, |
| { |
| "epoch": 4.064365525322704, |
| "grad_norm": 0.4027337431907654, |
| "learning_rate": 2.9678218506933873e-05, |
| "loss": 0.0287, |
| "num_input_tokens_seen": 451068928, |
| "step": 440500 |
| }, |
| { |
| "epoch": 4.068978880062003, |
| "grad_norm": 1.1440553665161133, |
| "learning_rate": 2.965515173323738e-05, |
| "loss": 0.0313, |
| "num_input_tokens_seen": 451580928, |
| "step": 441000 |
| }, |
| { |
| "epoch": 4.0735922348013025, |
| "grad_norm": 0.5619149208068848, |
| "learning_rate": 2.9632084959540878e-05, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 452092928, |
| "step": 441500 |
| }, |
| { |
| "epoch": 4.078205589540602, |
| "grad_norm": 3.5681047439575195, |
| "learning_rate": 2.9609018185844385e-05, |
| "loss": 0.0301, |
| "num_input_tokens_seen": 452604928, |
| "step": 442000 |
| }, |
| { |
| "epoch": 4.082818944279902, |
| "grad_norm": 1.2567273378372192, |
| "learning_rate": 2.958595141214789e-05, |
| "loss": 0.0317, |
| "num_input_tokens_seen": 453116928, |
| "step": 442500 |
| }, |
| { |
| "epoch": 4.087432299019201, |
| "grad_norm": 1.553036093711853, |
| "learning_rate": 2.956288463845139e-05, |
| "loss": 0.0296, |
| "num_input_tokens_seen": 453628928, |
| "step": 443000 |
| }, |
| { |
| "epoch": 4.0920456537585, |
| "grad_norm": 0.8509573340415955, |
| "learning_rate": 2.9539817864754894e-05, |
| "loss": 0.0325, |
| "num_input_tokens_seen": 454140928, |
| "step": 443500 |
| }, |
| { |
| "epoch": 4.0966590084978, |
| "grad_norm": 1.0355197191238403, |
| "learning_rate": 2.9516751091058398e-05, |
| "loss": 0.0346, |
| "num_input_tokens_seen": 454652928, |
| "step": 444000 |
| }, |
| { |
| "epoch": 4.101272363237099, |
| "grad_norm": 1.49540376663208, |
| "learning_rate": 2.94936843173619e-05, |
| "loss": 0.0335, |
| "num_input_tokens_seen": 455164928, |
| "step": 444500 |
| }, |
| { |
| "epoch": 4.105885717976398, |
| "grad_norm": 1.6079996824264526, |
| "learning_rate": 2.9470617543665402e-05, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 455676928, |
| "step": 445000 |
| }, |
| { |
| "epoch": 4.110499072715697, |
| "grad_norm": 0.5073397159576416, |
| "learning_rate": 2.9447550769968906e-05, |
| "loss": 0.0308, |
| "num_input_tokens_seen": 456188928, |
| "step": 445500 |
| }, |
| { |
| "epoch": 4.115112427454997, |
| "grad_norm": 1.6608948707580566, |
| "learning_rate": 2.942448399627241e-05, |
| "loss": 0.0302, |
| "num_input_tokens_seen": 456700928, |
| "step": 446000 |
| }, |
| { |
| "epoch": 4.119725782194296, |
| "grad_norm": 0.9647392630577087, |
| "learning_rate": 2.9401417222575917e-05, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 457212928, |
| "step": 446500 |
| }, |
| { |
| "epoch": 4.124339136933595, |
| "grad_norm": 0.6390677690505981, |
| "learning_rate": 2.9378350448879415e-05, |
| "loss": 0.0305, |
| "num_input_tokens_seen": 457724928, |
| "step": 447000 |
| }, |
| { |
| "epoch": 4.128952491672894, |
| "grad_norm": 1.7215697765350342, |
| "learning_rate": 2.9355283675182922e-05, |
| "loss": 0.0328, |
| "num_input_tokens_seen": 458236928, |
| "step": 447500 |
| }, |
| { |
| "epoch": 4.133565846412194, |
| "grad_norm": 1.1551854610443115, |
| "learning_rate": 2.9332216901486426e-05, |
| "loss": 0.0313, |
| "num_input_tokens_seen": 458748928, |
| "step": 448000 |
| }, |
| { |
| "epoch": 4.138179201151494, |
| "grad_norm": 1.6345293521881104, |
| "learning_rate": 2.9309150127789927e-05, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 459260928, |
| "step": 448500 |
| }, |
| { |
| "epoch": 4.142792555890793, |
| "grad_norm": 1.5224887132644653, |
| "learning_rate": 2.928608335409343e-05, |
| "loss": 0.0307, |
| "num_input_tokens_seen": 459772928, |
| "step": 449000 |
| }, |
| { |
| "epoch": 4.147405910630092, |
| "grad_norm": 1.6716899871826172, |
| "learning_rate": 2.9263016580396934e-05, |
| "loss": 0.0346, |
| "num_input_tokens_seen": 460284928, |
| "step": 449500 |
| }, |
| { |
| "epoch": 4.1520192653693915, |
| "grad_norm": 2.299623489379883, |
| "learning_rate": 2.923994980670044e-05, |
| "loss": 0.0301, |
| "num_input_tokens_seen": 460796928, |
| "step": 450000 |
| }, |
| { |
| "epoch": 4.156632620108691, |
| "grad_norm": 0.7651464343070984, |
| "learning_rate": 2.921688303300394e-05, |
| "loss": 0.0308, |
| "num_input_tokens_seen": 461308928, |
| "step": 450500 |
| }, |
| { |
| "epoch": 4.16124597484799, |
| "grad_norm": 1.1913387775421143, |
| "learning_rate": 2.9193816259307443e-05, |
| "loss": 0.0312, |
| "num_input_tokens_seen": 461820928, |
| "step": 451000 |
| }, |
| { |
| "epoch": 4.165859329587289, |
| "grad_norm": 1.0334786176681519, |
| "learning_rate": 2.9170749485610947e-05, |
| "loss": 0.0335, |
| "num_input_tokens_seen": 462332928, |
| "step": 451500 |
| }, |
| { |
| "epoch": 4.1704726843265885, |
| "grad_norm": 1.9780852794647217, |
| "learning_rate": 2.9147682711914454e-05, |
| "loss": 0.0344, |
| "num_input_tokens_seen": 462844928, |
| "step": 452000 |
| }, |
| { |
| "epoch": 4.175086039065888, |
| "grad_norm": 0.8200696706771851, |
| "learning_rate": 2.912461593821795e-05, |
| "loss": 0.033, |
| "num_input_tokens_seen": 463356928, |
| "step": 452500 |
| }, |
| { |
| "epoch": 4.179699393805187, |
| "grad_norm": 1.0019230842590332, |
| "learning_rate": 2.910154916452146e-05, |
| "loss": 0.0303, |
| "num_input_tokens_seen": 463868928, |
| "step": 453000 |
| }, |
| { |
| "epoch": 4.184312748544486, |
| "grad_norm": 2.18719744682312, |
| "learning_rate": 2.9078482390824963e-05, |
| "loss": 0.03, |
| "num_input_tokens_seen": 464380928, |
| "step": 453500 |
| }, |
| { |
| "epoch": 4.1889261032837855, |
| "grad_norm": 1.2453852891921997, |
| "learning_rate": 2.9055415617128467e-05, |
| "loss": 0.0306, |
| "num_input_tokens_seen": 464892928, |
| "step": 454000 |
| }, |
| { |
| "epoch": 4.193539458023086, |
| "grad_norm": 2.0544652938842773, |
| "learning_rate": 2.9032348843431967e-05, |
| "loss": 0.0331, |
| "num_input_tokens_seen": 465404928, |
| "step": 454500 |
| }, |
| { |
| "epoch": 4.198152812762385, |
| "grad_norm": 5.509039878845215, |
| "learning_rate": 2.900928206973547e-05, |
| "loss": 0.0308, |
| "num_input_tokens_seen": 465916928, |
| "step": 455000 |
| }, |
| { |
| "epoch": 4.202766167501684, |
| "grad_norm": 0.6365485787391663, |
| "learning_rate": 2.8986215296038975e-05, |
| "loss": 0.0322, |
| "num_input_tokens_seen": 466428928, |
| "step": 455500 |
| }, |
| { |
| "epoch": 4.207379522240983, |
| "grad_norm": 0.8369764685630798, |
| "learning_rate": 2.8963148522342476e-05, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 466940928, |
| "step": 456000 |
| }, |
| { |
| "epoch": 4.211992876980283, |
| "grad_norm": 1.3454687595367432, |
| "learning_rate": 2.894008174864598e-05, |
| "loss": 0.0317, |
| "num_input_tokens_seen": 467452928, |
| "step": 456500 |
| }, |
| { |
| "epoch": 4.216606231719582, |
| "grad_norm": 1.042900800704956, |
| "learning_rate": 2.8917014974949487e-05, |
| "loss": 0.0304, |
| "num_input_tokens_seen": 467964928, |
| "step": 457000 |
| }, |
| { |
| "epoch": 4.221219586458881, |
| "grad_norm": 2.2044434547424316, |
| "learning_rate": 2.889394820125299e-05, |
| "loss": 0.0309, |
| "num_input_tokens_seen": 468476928, |
| "step": 457500 |
| }, |
| { |
| "epoch": 4.22583294119818, |
| "grad_norm": 1.4156602621078491, |
| "learning_rate": 2.887088142755649e-05, |
| "loss": 0.0325, |
| "num_input_tokens_seen": 468988928, |
| "step": 458000 |
| }, |
| { |
| "epoch": 4.23044629593748, |
| "grad_norm": 1.4290229082107544, |
| "learning_rate": 2.8847814653859996e-05, |
| "loss": 0.034, |
| "num_input_tokens_seen": 469500928, |
| "step": 458500 |
| }, |
| { |
| "epoch": 4.235059650676779, |
| "grad_norm": 0.8856704235076904, |
| "learning_rate": 2.88247478801635e-05, |
| "loss": 0.0301, |
| "num_input_tokens_seen": 470012928, |
| "step": 459000 |
| }, |
| { |
| "epoch": 4.239673005416078, |
| "grad_norm": 1.0637128353118896, |
| "learning_rate": 2.8801681106467004e-05, |
| "loss": 0.0315, |
| "num_input_tokens_seen": 470524928, |
| "step": 459500 |
| }, |
| { |
| "epoch": 4.244286360155378, |
| "grad_norm": 0.9506544470787048, |
| "learning_rate": 2.8778614332770504e-05, |
| "loss": 0.0298, |
| "num_input_tokens_seen": 471036928, |
| "step": 460000 |
| }, |
| { |
| "epoch": 4.248899714894677, |
| "grad_norm": 1.05034339427948, |
| "learning_rate": 2.8755547559074008e-05, |
| "loss": 0.0331, |
| "num_input_tokens_seen": 471548928, |
| "step": 460500 |
| }, |
| { |
| "epoch": 4.253513069633977, |
| "grad_norm": 1.1537014245986938, |
| "learning_rate": 2.8732480785377512e-05, |
| "loss": 0.0309, |
| "num_input_tokens_seen": 472060928, |
| "step": 461000 |
| }, |
| { |
| "epoch": 4.258126424373276, |
| "grad_norm": 0.42139768600463867, |
| "learning_rate": 2.8709414011681013e-05, |
| "loss": 0.032, |
| "num_input_tokens_seen": 472572928, |
| "step": 461500 |
| }, |
| { |
| "epoch": 4.262739779112575, |
| "grad_norm": 2.2188069820404053, |
| "learning_rate": 2.8686347237984517e-05, |
| "loss": 0.0301, |
| "num_input_tokens_seen": 473084928, |
| "step": 462000 |
| }, |
| { |
| "epoch": 4.267353133851874, |
| "grad_norm": 1.293926477432251, |
| "learning_rate": 2.8663280464288024e-05, |
| "loss": 0.0323, |
| "num_input_tokens_seen": 473596928, |
| "step": 462500 |
| }, |
| { |
| "epoch": 4.271966488591174, |
| "grad_norm": 1.7295567989349365, |
| "learning_rate": 2.8640213690591528e-05, |
| "loss": 0.0286, |
| "num_input_tokens_seen": 474108928, |
| "step": 463000 |
| }, |
| { |
| "epoch": 4.276579843330473, |
| "grad_norm": 1.3442994356155396, |
| "learning_rate": 2.8617146916895025e-05, |
| "loss": 0.0312, |
| "num_input_tokens_seen": 474620928, |
| "step": 463500 |
| }, |
| { |
| "epoch": 4.281193198069772, |
| "grad_norm": 1.4000321626663208, |
| "learning_rate": 2.8594080143198533e-05, |
| "loss": 0.0326, |
| "num_input_tokens_seen": 475132928, |
| "step": 464000 |
| }, |
| { |
| "epoch": 4.285806552809071, |
| "grad_norm": 1.4646140336990356, |
| "learning_rate": 2.8571013369502037e-05, |
| "loss": 0.0316, |
| "num_input_tokens_seen": 475644928, |
| "step": 464500 |
| }, |
| { |
| "epoch": 4.290419907548371, |
| "grad_norm": 1.296420931816101, |
| "learning_rate": 2.854794659580554e-05, |
| "loss": 0.0328, |
| "num_input_tokens_seen": 476156928, |
| "step": 465000 |
| }, |
| { |
| "epoch": 4.295033262287671, |
| "grad_norm": 0.947172999382019, |
| "learning_rate": 2.852487982210904e-05, |
| "loss": 0.031, |
| "num_input_tokens_seen": 476668928, |
| "step": 465500 |
| }, |
| { |
| "epoch": 4.29964661702697, |
| "grad_norm": 0.6631402969360352, |
| "learning_rate": 2.8501813048412545e-05, |
| "loss": 0.0291, |
| "num_input_tokens_seen": 477180928, |
| "step": 466000 |
| }, |
| { |
| "epoch": 4.304259971766269, |
| "grad_norm": 0.5878441333770752, |
| "learning_rate": 2.847874627471605e-05, |
| "loss": 0.0316, |
| "num_input_tokens_seen": 477692928, |
| "step": 466500 |
| }, |
| { |
| "epoch": 4.3088733265055685, |
| "grad_norm": 1.32041335105896, |
| "learning_rate": 2.8455679501019557e-05, |
| "loss": 0.0296, |
| "num_input_tokens_seen": 478204928, |
| "step": 467000 |
| }, |
| { |
| "epoch": 4.313486681244868, |
| "grad_norm": 0.7355374097824097, |
| "learning_rate": 2.8432612727323054e-05, |
| "loss": 0.0322, |
| "num_input_tokens_seen": 478716928, |
| "step": 467500 |
| }, |
| { |
| "epoch": 4.318100035984167, |
| "grad_norm": 0.5715786218643188, |
| "learning_rate": 2.840954595362656e-05, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 479228928, |
| "step": 468000 |
| }, |
| { |
| "epoch": 4.322713390723466, |
| "grad_norm": 0.873299777507782, |
| "learning_rate": 2.8386479179930065e-05, |
| "loss": 0.0341, |
| "num_input_tokens_seen": 479740928, |
| "step": 468500 |
| }, |
| { |
| "epoch": 4.3273267454627655, |
| "grad_norm": 0.4993022382259369, |
| "learning_rate": 2.8363412406233562e-05, |
| "loss": 0.0347, |
| "num_input_tokens_seen": 480252928, |
| "step": 469000 |
| }, |
| { |
| "epoch": 4.331940100202065, |
| "grad_norm": 1.0970638990402222, |
| "learning_rate": 2.834034563253707e-05, |
| "loss": 0.0297, |
| "num_input_tokens_seen": 480764928, |
| "step": 469500 |
| }, |
| { |
| "epoch": 4.336553454941364, |
| "grad_norm": 1.030454158782959, |
| "learning_rate": 2.8317278858840574e-05, |
| "loss": 0.0309, |
| "num_input_tokens_seen": 481276928, |
| "step": 470000 |
| }, |
| { |
| "epoch": 4.341166809680663, |
| "grad_norm": 2.224727153778076, |
| "learning_rate": 2.8294212085144078e-05, |
| "loss": 0.0319, |
| "num_input_tokens_seen": 481788928, |
| "step": 470500 |
| }, |
| { |
| "epoch": 4.3457801644199625, |
| "grad_norm": 0.8922818899154663, |
| "learning_rate": 2.8271145311447578e-05, |
| "loss": 0.0324, |
| "num_input_tokens_seen": 482300928, |
| "step": 471000 |
| }, |
| { |
| "epoch": 4.350393519159263, |
| "grad_norm": 1.355394721031189, |
| "learning_rate": 2.8248078537751082e-05, |
| "loss": 0.0322, |
| "num_input_tokens_seen": 482812928, |
| "step": 471500 |
| }, |
| { |
| "epoch": 4.355006873898562, |
| "grad_norm": 1.3697582483291626, |
| "learning_rate": 2.8225011764054586e-05, |
| "loss": 0.0307, |
| "num_input_tokens_seen": 483324928, |
| "step": 472000 |
| }, |
| { |
| "epoch": 4.359620228637861, |
| "grad_norm": 0.8543123006820679, |
| "learning_rate": 2.8201944990358093e-05, |
| "loss": 0.0308, |
| "num_input_tokens_seen": 483836928, |
| "step": 472500 |
| }, |
| { |
| "epoch": 4.36423358337716, |
| "grad_norm": 1.2586286067962646, |
| "learning_rate": 2.817887821666159e-05, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 484348928, |
| "step": 473000 |
| }, |
| { |
| "epoch": 4.36884693811646, |
| "grad_norm": 1.0295668840408325, |
| "learning_rate": 2.8155811442965098e-05, |
| "loss": 0.0324, |
| "num_input_tokens_seen": 484860928, |
| "step": 473500 |
| }, |
| { |
| "epoch": 4.373460292855759, |
| "grad_norm": 1.3368573188781738, |
| "learning_rate": 2.8132744669268602e-05, |
| "loss": 0.0303, |
| "num_input_tokens_seen": 485372928, |
| "step": 474000 |
| }, |
| { |
| "epoch": 4.378073647595058, |
| "grad_norm": 0.5129613280296326, |
| "learning_rate": 2.81096778955721e-05, |
| "loss": 0.031, |
| "num_input_tokens_seen": 485884928, |
| "step": 474500 |
| }, |
| { |
| "epoch": 4.382687002334357, |
| "grad_norm": 0.7094746828079224, |
| "learning_rate": 2.8086611121875606e-05, |
| "loss": 0.0305, |
| "num_input_tokens_seen": 486396928, |
| "step": 475000 |
| }, |
| { |
| "epoch": 4.387300357073657, |
| "grad_norm": 1.2379733324050903, |
| "learning_rate": 2.806354434817911e-05, |
| "loss": 0.035, |
| "num_input_tokens_seen": 486908928, |
| "step": 475500 |
| }, |
| { |
| "epoch": 4.391913711812956, |
| "grad_norm": 0.9573284387588501, |
| "learning_rate": 2.8040477574482614e-05, |
| "loss": 0.0321, |
| "num_input_tokens_seen": 487420928, |
| "step": 476000 |
| }, |
| { |
| "epoch": 4.396527066552255, |
| "grad_norm": 0.8460474014282227, |
| "learning_rate": 2.8017410800786115e-05, |
| "loss": 0.032, |
| "num_input_tokens_seen": 487932928, |
| "step": 476500 |
| }, |
| { |
| "epoch": 4.401140421291554, |
| "grad_norm": 0.5795192122459412, |
| "learning_rate": 2.799434402708962e-05, |
| "loss": 0.0331, |
| "num_input_tokens_seen": 488444928, |
| "step": 477000 |
| }, |
| { |
| "epoch": 4.4057537760308545, |
| "grad_norm": 2.4742841720581055, |
| "learning_rate": 2.7971277253393123e-05, |
| "loss": 0.0287, |
| "num_input_tokens_seen": 488956928, |
| "step": 477500 |
| }, |
| { |
| "epoch": 4.410367130770154, |
| "grad_norm": 2.2295806407928467, |
| "learning_rate": 2.794821047969663e-05, |
| "loss": 0.0343, |
| "num_input_tokens_seen": 489468928, |
| "step": 478000 |
| }, |
| { |
| "epoch": 4.414980485509453, |
| "grad_norm": 1.4073495864868164, |
| "learning_rate": 2.7925143706000128e-05, |
| "loss": 0.0335, |
| "num_input_tokens_seen": 489980928, |
| "step": 478500 |
| }, |
| { |
| "epoch": 4.419593840248752, |
| "grad_norm": 1.378461480140686, |
| "learning_rate": 2.7902076932303635e-05, |
| "loss": 0.0343, |
| "num_input_tokens_seen": 490492928, |
| "step": 479000 |
| }, |
| { |
| "epoch": 4.4242071949880515, |
| "grad_norm": 0.6204975247383118, |
| "learning_rate": 2.787901015860714e-05, |
| "loss": 0.0323, |
| "num_input_tokens_seen": 491004928, |
| "step": 479500 |
| }, |
| { |
| "epoch": 4.428820549727351, |
| "grad_norm": 1.0409677028656006, |
| "learning_rate": 2.7855943384910643e-05, |
| "loss": 0.0325, |
| "num_input_tokens_seen": 491516928, |
| "step": 480000 |
| }, |
| { |
| "epoch": 4.43343390446665, |
| "grad_norm": 1.2104921340942383, |
| "learning_rate": 2.7832876611214143e-05, |
| "loss": 0.0339, |
| "num_input_tokens_seen": 492028928, |
| "step": 480500 |
| }, |
| { |
| "epoch": 4.438047259205949, |
| "grad_norm": 2.0074825286865234, |
| "learning_rate": 2.7809809837517647e-05, |
| "loss": 0.0322, |
| "num_input_tokens_seen": 492540928, |
| "step": 481000 |
| }, |
| { |
| "epoch": 4.4426606139452485, |
| "grad_norm": 0.8541880249977112, |
| "learning_rate": 2.778674306382115e-05, |
| "loss": 0.0299, |
| "num_input_tokens_seen": 493052928, |
| "step": 481500 |
| }, |
| { |
| "epoch": 4.447273968684548, |
| "grad_norm": 2.382373332977295, |
| "learning_rate": 2.7763676290124652e-05, |
| "loss": 0.0303, |
| "num_input_tokens_seen": 493564928, |
| "step": 482000 |
| }, |
| { |
| "epoch": 4.451887323423847, |
| "grad_norm": 0.8820599317550659, |
| "learning_rate": 2.7740609516428156e-05, |
| "loss": 0.0306, |
| "num_input_tokens_seen": 494076928, |
| "step": 482500 |
| }, |
| { |
| "epoch": 4.456500678163147, |
| "grad_norm": 0.6329056620597839, |
| "learning_rate": 2.771754274273166e-05, |
| "loss": 0.0313, |
| "num_input_tokens_seen": 494588928, |
| "step": 483000 |
| }, |
| { |
| "epoch": 4.461114032902446, |
| "grad_norm": 0.7391223311424255, |
| "learning_rate": 2.7694475969035167e-05, |
| "loss": 0.0342, |
| "num_input_tokens_seen": 495100928, |
| "step": 483500 |
| }, |
| { |
| "epoch": 4.465727387641746, |
| "grad_norm": 0.6143118143081665, |
| "learning_rate": 2.7671409195338664e-05, |
| "loss": 0.0324, |
| "num_input_tokens_seen": 495612928, |
| "step": 484000 |
| }, |
| { |
| "epoch": 4.470340742381045, |
| "grad_norm": 2.01242733001709, |
| "learning_rate": 2.7648342421642172e-05, |
| "loss": 0.029, |
| "num_input_tokens_seen": 496124928, |
| "step": 484500 |
| }, |
| { |
| "epoch": 4.474954097120344, |
| "grad_norm": 0.9278964996337891, |
| "learning_rate": 2.7625275647945676e-05, |
| "loss": 0.0362, |
| "num_input_tokens_seen": 496636928, |
| "step": 485000 |
| }, |
| { |
| "epoch": 4.479567451859643, |
| "grad_norm": 1.0499247312545776, |
| "learning_rate": 2.760220887424918e-05, |
| "loss": 0.033, |
| "num_input_tokens_seen": 497148928, |
| "step": 485500 |
| }, |
| { |
| "epoch": 4.484180806598943, |
| "grad_norm": 1.7017521858215332, |
| "learning_rate": 2.757914210055268e-05, |
| "loss": 0.0304, |
| "num_input_tokens_seen": 497660928, |
| "step": 486000 |
| }, |
| { |
| "epoch": 4.488794161338242, |
| "grad_norm": 2.3478429317474365, |
| "learning_rate": 2.7556075326856184e-05, |
| "loss": 0.0329, |
| "num_input_tokens_seen": 498172928, |
| "step": 486500 |
| }, |
| { |
| "epoch": 4.493407516077541, |
| "grad_norm": 3.133190155029297, |
| "learning_rate": 2.7533008553159688e-05, |
| "loss": 0.0341, |
| "num_input_tokens_seen": 498684928, |
| "step": 487000 |
| }, |
| { |
| "epoch": 4.49802087081684, |
| "grad_norm": 0.5625250339508057, |
| "learning_rate": 2.750994177946319e-05, |
| "loss": 0.0328, |
| "num_input_tokens_seen": 499196928, |
| "step": 487500 |
| }, |
| { |
| "epoch": 4.50263422555614, |
| "grad_norm": 1.0259020328521729, |
| "learning_rate": 2.7486875005766693e-05, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 499708928, |
| "step": 488000 |
| }, |
| { |
| "epoch": 4.50724758029544, |
| "grad_norm": 0.48490577936172485, |
| "learning_rate": 2.7463808232070197e-05, |
| "loss": 0.0335, |
| "num_input_tokens_seen": 500220928, |
| "step": 488500 |
| }, |
| { |
| "epoch": 4.511860935034738, |
| "grad_norm": 0.40793031454086304, |
| "learning_rate": 2.7440741458373704e-05, |
| "loss": 0.0312, |
| "num_input_tokens_seen": 500732928, |
| "step": 489000 |
| }, |
| { |
| "epoch": 4.516474289774038, |
| "grad_norm": 1.1319341659545898, |
| "learning_rate": 2.74176746846772e-05, |
| "loss": 0.0327, |
| "num_input_tokens_seen": 501244928, |
| "step": 489500 |
| }, |
| { |
| "epoch": 4.5210876445133374, |
| "grad_norm": 1.9659985303878784, |
| "learning_rate": 2.739460791098071e-05, |
| "loss": 0.0338, |
| "num_input_tokens_seen": 501756928, |
| "step": 490000 |
| }, |
| { |
| "epoch": 4.525700999252637, |
| "grad_norm": 0.5315821766853333, |
| "learning_rate": 2.7371541137284213e-05, |
| "loss": 0.0336, |
| "num_input_tokens_seen": 502268928, |
| "step": 490500 |
| }, |
| { |
| "epoch": 4.530314353991936, |
| "grad_norm": 0.47908708453178406, |
| "learning_rate": 2.7348474363587717e-05, |
| "loss": 0.0295, |
| "num_input_tokens_seen": 502780928, |
| "step": 491000 |
| }, |
| { |
| "epoch": 4.534927708731235, |
| "grad_norm": 0.9557788968086243, |
| "learning_rate": 2.7325407589891217e-05, |
| "loss": 0.0314, |
| "num_input_tokens_seen": 503292928, |
| "step": 491500 |
| }, |
| { |
| "epoch": 4.539541063470534, |
| "grad_norm": 1.229929804801941, |
| "learning_rate": 2.730234081619472e-05, |
| "loss": 0.03, |
| "num_input_tokens_seen": 503804928, |
| "step": 492000 |
| }, |
| { |
| "epoch": 4.544154418209834, |
| "grad_norm": 2.0131001472473145, |
| "learning_rate": 2.7279274042498225e-05, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 504316928, |
| "step": 492500 |
| }, |
| { |
| "epoch": 4.548767772949133, |
| "grad_norm": 1.8093568086624146, |
| "learning_rate": 2.7256207268801732e-05, |
| "loss": 0.0315, |
| "num_input_tokens_seen": 504828928, |
| "step": 493000 |
| }, |
| { |
| "epoch": 4.553381127688432, |
| "grad_norm": 3.043375253677368, |
| "learning_rate": 2.723314049510523e-05, |
| "loss": 0.0336, |
| "num_input_tokens_seen": 505340928, |
| "step": 493500 |
| }, |
| { |
| "epoch": 4.557994482427731, |
| "grad_norm": 1.5375556945800781, |
| "learning_rate": 2.7210073721408734e-05, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 505852928, |
| "step": 494000 |
| }, |
| { |
| "epoch": 4.562607837167031, |
| "grad_norm": 1.2980600595474243, |
| "learning_rate": 2.718700694771224e-05, |
| "loss": 0.0315, |
| "num_input_tokens_seen": 506364928, |
| "step": 494500 |
| }, |
| { |
| "epoch": 4.567221191906331, |
| "grad_norm": 1.3334441184997559, |
| "learning_rate": 2.7163940174015738e-05, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 506876928, |
| "step": 495000 |
| }, |
| { |
| "epoch": 4.57183454664563, |
| "grad_norm": 10.070221900939941, |
| "learning_rate": 2.7140873400319246e-05, |
| "loss": 0.034, |
| "num_input_tokens_seen": 507388928, |
| "step": 495500 |
| }, |
| { |
| "epoch": 4.576447901384929, |
| "grad_norm": 9.152368545532227, |
| "learning_rate": 2.711780662662275e-05, |
| "loss": 0.0316, |
| "num_input_tokens_seen": 507900928, |
| "step": 496000 |
| }, |
| { |
| "epoch": 4.5810612561242285, |
| "grad_norm": 2.569089651107788, |
| "learning_rate": 2.7094739852926254e-05, |
| "loss": 0.0336, |
| "num_input_tokens_seen": 508412928, |
| "step": 496500 |
| }, |
| { |
| "epoch": 4.585674610863528, |
| "grad_norm": 0.7014693021774292, |
| "learning_rate": 2.7071673079229754e-05, |
| "loss": 0.0316, |
| "num_input_tokens_seen": 508924928, |
| "step": 497000 |
| }, |
| { |
| "epoch": 4.590287965602827, |
| "grad_norm": 1.182787537574768, |
| "learning_rate": 2.7048606305533258e-05, |
| "loss": 0.0315, |
| "num_input_tokens_seen": 509436928, |
| "step": 497500 |
| }, |
| { |
| "epoch": 4.594901320342126, |
| "grad_norm": 0.6506703495979309, |
| "learning_rate": 2.7025539531836762e-05, |
| "loss": 0.0328, |
| "num_input_tokens_seen": 509948928, |
| "step": 498000 |
| }, |
| { |
| "epoch": 4.5995146750814255, |
| "grad_norm": 0.5681861639022827, |
| "learning_rate": 2.700247275814027e-05, |
| "loss": 0.0343, |
| "num_input_tokens_seen": 510460928, |
| "step": 498500 |
| }, |
| { |
| "epoch": 4.604128029820725, |
| "grad_norm": 1.2895385026931763, |
| "learning_rate": 2.6979405984443767e-05, |
| "loss": 0.0332, |
| "num_input_tokens_seen": 510972928, |
| "step": 499000 |
| }, |
| { |
| "epoch": 4.608741384560024, |
| "grad_norm": 1.2549630403518677, |
| "learning_rate": 2.6956339210747274e-05, |
| "loss": 0.032, |
| "num_input_tokens_seen": 511484928, |
| "step": 499500 |
| }, |
| { |
| "epoch": 4.613354739299323, |
| "grad_norm": 1.486061692237854, |
| "learning_rate": 2.6933272437050778e-05, |
| "loss": 0.0331, |
| "num_input_tokens_seen": 511996928, |
| "step": 500000 |
| }, |
| { |
| "epoch": 4.617968094038623, |
| "grad_norm": 1.0897846221923828, |
| "learning_rate": 2.6910205663354275e-05, |
| "loss": 0.0327, |
| "num_input_tokens_seen": 512508928, |
| "step": 500500 |
| }, |
| { |
| "epoch": 4.622581448777923, |
| "grad_norm": 0.9600527286529541, |
| "learning_rate": 2.6887138889657782e-05, |
| "loss": 0.0326, |
| "num_input_tokens_seen": 513020928, |
| "step": 501000 |
| }, |
| { |
| "epoch": 4.627194803517222, |
| "grad_norm": 3.943963050842285, |
| "learning_rate": 2.6864072115961286e-05, |
| "loss": 0.0283, |
| "num_input_tokens_seen": 513532928, |
| "step": 501500 |
| }, |
| { |
| "epoch": 4.631808158256521, |
| "grad_norm": 1.1537055969238281, |
| "learning_rate": 2.684100534226479e-05, |
| "loss": 0.0353, |
| "num_input_tokens_seen": 514044928, |
| "step": 502000 |
| }, |
| { |
| "epoch": 4.63642151299582, |
| "grad_norm": 2.200751543045044, |
| "learning_rate": 2.681793856856829e-05, |
| "loss": 0.0323, |
| "num_input_tokens_seen": 514556928, |
| "step": 502500 |
| }, |
| { |
| "epoch": 4.64103486773512, |
| "grad_norm": 1.1844205856323242, |
| "learning_rate": 2.6794871794871795e-05, |
| "loss": 0.0308, |
| "num_input_tokens_seen": 515068928, |
| "step": 503000 |
| }, |
| { |
| "epoch": 4.645648222474419, |
| "grad_norm": 4.328240871429443, |
| "learning_rate": 2.67718050211753e-05, |
| "loss": 0.0337, |
| "num_input_tokens_seen": 515580928, |
| "step": 503500 |
| }, |
| { |
| "epoch": 4.650261577213718, |
| "grad_norm": 1.1905447244644165, |
| "learning_rate": 2.6748738247478806e-05, |
| "loss": 0.0335, |
| "num_input_tokens_seen": 516092928, |
| "step": 504000 |
| }, |
| { |
| "epoch": 4.654874931953017, |
| "grad_norm": 0.4069402813911438, |
| "learning_rate": 2.6725671473782303e-05, |
| "loss": 0.034, |
| "num_input_tokens_seen": 516604928, |
| "step": 504500 |
| }, |
| { |
| "epoch": 4.659488286692317, |
| "grad_norm": 0.7860555648803711, |
| "learning_rate": 2.670260470008581e-05, |
| "loss": 0.034, |
| "num_input_tokens_seen": 517116928, |
| "step": 505000 |
| }, |
| { |
| "epoch": 4.664101641431616, |
| "grad_norm": 0.5769841074943542, |
| "learning_rate": 2.6679537926389315e-05, |
| "loss": 0.033, |
| "num_input_tokens_seen": 517628928, |
| "step": 505500 |
| }, |
| { |
| "epoch": 4.668714996170916, |
| "grad_norm": 1.5153945684432983, |
| "learning_rate": 2.665647115269282e-05, |
| "loss": 0.031, |
| "num_input_tokens_seen": 518140928, |
| "step": 506000 |
| }, |
| { |
| "epoch": 4.673328350910215, |
| "grad_norm": 1.6713037490844727, |
| "learning_rate": 2.663340437899632e-05, |
| "loss": 0.037, |
| "num_input_tokens_seen": 518652928, |
| "step": 506500 |
| }, |
| { |
| "epoch": 4.6779417056495145, |
| "grad_norm": 1.2307850122451782, |
| "learning_rate": 2.6610337605299823e-05, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 519164928, |
| "step": 507000 |
| }, |
| { |
| "epoch": 4.682555060388814, |
| "grad_norm": 1.2771391868591309, |
| "learning_rate": 2.6587270831603327e-05, |
| "loss": 0.0292, |
| "num_input_tokens_seen": 519676928, |
| "step": 507500 |
| }, |
| { |
| "epoch": 4.687168415128113, |
| "grad_norm": 1.468724012374878, |
| "learning_rate": 2.6564204057906828e-05, |
| "loss": 0.0314, |
| "num_input_tokens_seen": 520188928, |
| "step": 508000 |
| }, |
| { |
| "epoch": 4.691781769867412, |
| "grad_norm": 0.9526101350784302, |
| "learning_rate": 2.6541137284210332e-05, |
| "loss": 0.033, |
| "num_input_tokens_seen": 520700928, |
| "step": 508500 |
| }, |
| { |
| "epoch": 4.6963951246067115, |
| "grad_norm": 0.8857848048210144, |
| "learning_rate": 2.6518070510513836e-05, |
| "loss": 0.0333, |
| "num_input_tokens_seen": 521212928, |
| "step": 509000 |
| }, |
| { |
| "epoch": 4.701008479346011, |
| "grad_norm": 1.5435466766357422, |
| "learning_rate": 2.6495003736817343e-05, |
| "loss": 0.0319, |
| "num_input_tokens_seen": 521724928, |
| "step": 509500 |
| }, |
| { |
| "epoch": 4.70562183408531, |
| "grad_norm": 0.6249234676361084, |
| "learning_rate": 2.647193696312084e-05, |
| "loss": 0.0307, |
| "num_input_tokens_seen": 522236928, |
| "step": 510000 |
| }, |
| { |
| "epoch": 4.710235188824609, |
| "grad_norm": 0.7634549140930176, |
| "learning_rate": 2.6448870189424348e-05, |
| "loss": 0.0325, |
| "num_input_tokens_seen": 522748928, |
| "step": 510500 |
| }, |
| { |
| "epoch": 4.7148485435639085, |
| "grad_norm": 0.8510231375694275, |
| "learning_rate": 2.6425803415727852e-05, |
| "loss": 0.0323, |
| "num_input_tokens_seen": 523260928, |
| "step": 511000 |
| }, |
| { |
| "epoch": 4.719461898303209, |
| "grad_norm": 0.797269344329834, |
| "learning_rate": 2.6402736642031356e-05, |
| "loss": 0.035, |
| "num_input_tokens_seen": 523772928, |
| "step": 511500 |
| }, |
| { |
| "epoch": 4.724075253042507, |
| "grad_norm": 1.6006139516830444, |
| "learning_rate": 2.6379669868334856e-05, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 524284928, |
| "step": 512000 |
| }, |
| { |
| "epoch": 4.728688607781807, |
| "grad_norm": 0.5628824234008789, |
| "learning_rate": 2.635660309463836e-05, |
| "loss": 0.0298, |
| "num_input_tokens_seen": 524796928, |
| "step": 512500 |
| }, |
| { |
| "epoch": 4.733301962521106, |
| "grad_norm": 1.2842258214950562, |
| "learning_rate": 2.6333536320941864e-05, |
| "loss": 0.0329, |
| "num_input_tokens_seen": 525308928, |
| "step": 513000 |
| }, |
| { |
| "epoch": 4.737915317260406, |
| "grad_norm": 1.3331750631332397, |
| "learning_rate": 2.6310469547245365e-05, |
| "loss": 0.0346, |
| "num_input_tokens_seen": 525820928, |
| "step": 513500 |
| }, |
| { |
| "epoch": 4.742528671999705, |
| "grad_norm": 2.3819310665130615, |
| "learning_rate": 2.628740277354887e-05, |
| "loss": 0.0339, |
| "num_input_tokens_seen": 526332928, |
| "step": 514000 |
| }, |
| { |
| "epoch": 4.747142026739004, |
| "grad_norm": 0.8976543545722961, |
| "learning_rate": 2.6264335999852373e-05, |
| "loss": 0.035, |
| "num_input_tokens_seen": 526844928, |
| "step": 514500 |
| }, |
| { |
| "epoch": 4.751755381478303, |
| "grad_norm": 2.7922868728637695, |
| "learning_rate": 2.624126922615588e-05, |
| "loss": 0.0344, |
| "num_input_tokens_seen": 527356928, |
| "step": 515000 |
| }, |
| { |
| "epoch": 4.756368736217603, |
| "grad_norm": 1.2664451599121094, |
| "learning_rate": 2.6218202452459377e-05, |
| "loss": 0.033, |
| "num_input_tokens_seen": 527868928, |
| "step": 515500 |
| }, |
| { |
| "epoch": 4.760982090956902, |
| "grad_norm": 1.8173182010650635, |
| "learning_rate": 2.6195135678762885e-05, |
| "loss": 0.033, |
| "num_input_tokens_seen": 528380928, |
| "step": 516000 |
| }, |
| { |
| "epoch": 4.765595445696201, |
| "grad_norm": 1.2038295269012451, |
| "learning_rate": 2.617206890506639e-05, |
| "loss": 0.0329, |
| "num_input_tokens_seen": 528892928, |
| "step": 516500 |
| }, |
| { |
| "epoch": 4.7702088004355, |
| "grad_norm": 1.3875302076339722, |
| "learning_rate": 2.6149002131369893e-05, |
| "loss": 0.0337, |
| "num_input_tokens_seen": 529404928, |
| "step": 517000 |
| }, |
| { |
| "epoch": 4.7748221551748, |
| "grad_norm": 0.6060103178024292, |
| "learning_rate": 2.6125935357673393e-05, |
| "loss": 0.0331, |
| "num_input_tokens_seen": 529916928, |
| "step": 517500 |
| }, |
| { |
| "epoch": 4.7794355099141, |
| "grad_norm": 3.217010259628296, |
| "learning_rate": 2.6102868583976897e-05, |
| "loss": 0.0365, |
| "num_input_tokens_seen": 530428928, |
| "step": 518000 |
| }, |
| { |
| "epoch": 4.784048864653399, |
| "grad_norm": 1.3630263805389404, |
| "learning_rate": 2.60798018102804e-05, |
| "loss": 0.0352, |
| "num_input_tokens_seen": 530940928, |
| "step": 518500 |
| }, |
| { |
| "epoch": 4.788662219392698, |
| "grad_norm": 1.875205397605896, |
| "learning_rate": 2.605673503658391e-05, |
| "loss": 0.0312, |
| "num_input_tokens_seen": 531452928, |
| "step": 519000 |
| }, |
| { |
| "epoch": 4.7932755741319975, |
| "grad_norm": 1.0889365673065186, |
| "learning_rate": 2.6033668262887406e-05, |
| "loss": 0.032, |
| "num_input_tokens_seen": 531964928, |
| "step": 519500 |
| }, |
| { |
| "epoch": 4.797888928871297, |
| "grad_norm": 1.8945229053497314, |
| "learning_rate": 2.601060148919091e-05, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 532476928, |
| "step": 520000 |
| }, |
| { |
| "epoch": 4.802502283610596, |
| "grad_norm": 0.8704883456230164, |
| "learning_rate": 2.5987534715494417e-05, |
| "loss": 0.0353, |
| "num_input_tokens_seen": 532988928, |
| "step": 520500 |
| }, |
| { |
| "epoch": 4.807115638349895, |
| "grad_norm": 0.5920878052711487, |
| "learning_rate": 2.5964467941797914e-05, |
| "loss": 0.0352, |
| "num_input_tokens_seen": 533500928, |
| "step": 521000 |
| }, |
| { |
| "epoch": 4.811728993089194, |
| "grad_norm": 1.7447361946105957, |
| "learning_rate": 2.594140116810142e-05, |
| "loss": 0.0333, |
| "num_input_tokens_seen": 534012928, |
| "step": 521500 |
| }, |
| { |
| "epoch": 4.816342347828494, |
| "grad_norm": 2.5715444087982178, |
| "learning_rate": 2.5918334394404926e-05, |
| "loss": 0.0331, |
| "num_input_tokens_seen": 534524928, |
| "step": 522000 |
| }, |
| { |
| "epoch": 4.820955702567793, |
| "grad_norm": 1.5223846435546875, |
| "learning_rate": 2.589526762070843e-05, |
| "loss": 0.0326, |
| "num_input_tokens_seen": 535036928, |
| "step": 522500 |
| }, |
| { |
| "epoch": 4.825569057307092, |
| "grad_norm": 1.0512726306915283, |
| "learning_rate": 2.587220084701193e-05, |
| "loss": 0.0312, |
| "num_input_tokens_seen": 535548928, |
| "step": 523000 |
| }, |
| { |
| "epoch": 4.830182412046392, |
| "grad_norm": 1.2424243688583374, |
| "learning_rate": 2.5849134073315434e-05, |
| "loss": 0.0356, |
| "num_input_tokens_seen": 536060928, |
| "step": 523500 |
| }, |
| { |
| "epoch": 4.834795766785692, |
| "grad_norm": 1.2689915895462036, |
| "learning_rate": 2.5826067299618938e-05, |
| "loss": 0.0317, |
| "num_input_tokens_seen": 536572928, |
| "step": 524000 |
| }, |
| { |
| "epoch": 4.839409121524991, |
| "grad_norm": 0.5996227860450745, |
| "learning_rate": 2.5803000525922445e-05, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 537084928, |
| "step": 524500 |
| }, |
| { |
| "epoch": 4.84402247626429, |
| "grad_norm": 1.7113879919052124, |
| "learning_rate": 2.5779933752225943e-05, |
| "loss": 0.0322, |
| "num_input_tokens_seen": 537596928, |
| "step": 525000 |
| }, |
| { |
| "epoch": 4.848635831003589, |
| "grad_norm": 5.173702239990234, |
| "learning_rate": 2.5756866978529447e-05, |
| "loss": 0.0338, |
| "num_input_tokens_seen": 538108928, |
| "step": 525500 |
| }, |
| { |
| "epoch": 4.8532491857428885, |
| "grad_norm": 2.208484172821045, |
| "learning_rate": 2.5733800204832954e-05, |
| "loss": 0.0335, |
| "num_input_tokens_seen": 538620928, |
| "step": 526000 |
| }, |
| { |
| "epoch": 4.857862540482188, |
| "grad_norm": 0.7695846557617188, |
| "learning_rate": 2.571073343113645e-05, |
| "loss": 0.0323, |
| "num_input_tokens_seen": 539132928, |
| "step": 526500 |
| }, |
| { |
| "epoch": 4.862475895221487, |
| "grad_norm": 0.6419717073440552, |
| "learning_rate": 2.568766665743996e-05, |
| "loss": 0.0313, |
| "num_input_tokens_seen": 539644928, |
| "step": 527000 |
| }, |
| { |
| "epoch": 4.867089249960786, |
| "grad_norm": 0.4510629177093506, |
| "learning_rate": 2.5664599883743462e-05, |
| "loss": 0.0323, |
| "num_input_tokens_seen": 540156928, |
| "step": 527500 |
| }, |
| { |
| "epoch": 4.8717026047000855, |
| "grad_norm": 0.6697828769683838, |
| "learning_rate": 2.5641533110046966e-05, |
| "loss": 0.0306, |
| "num_input_tokens_seen": 540668928, |
| "step": 528000 |
| }, |
| { |
| "epoch": 4.876315959439385, |
| "grad_norm": 0.30349186062812805, |
| "learning_rate": 2.5618466336350467e-05, |
| "loss": 0.0354, |
| "num_input_tokens_seen": 541180928, |
| "step": 528500 |
| }, |
| { |
| "epoch": 4.880929314178685, |
| "grad_norm": 0.9010013937950134, |
| "learning_rate": 2.559539956265397e-05, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 541692928, |
| "step": 529000 |
| }, |
| { |
| "epoch": 4.885542668917984, |
| "grad_norm": 5.212312698364258, |
| "learning_rate": 2.5572332788957475e-05, |
| "loss": 0.0338, |
| "num_input_tokens_seen": 542204928, |
| "step": 529500 |
| }, |
| { |
| "epoch": 4.890156023657283, |
| "grad_norm": 0.5742513537406921, |
| "learning_rate": 2.5549266015260982e-05, |
| "loss": 0.0331, |
| "num_input_tokens_seen": 542716928, |
| "step": 530000 |
| }, |
| { |
| "epoch": 4.894769378396583, |
| "grad_norm": 1.1083173751831055, |
| "learning_rate": 2.552619924156448e-05, |
| "loss": 0.0332, |
| "num_input_tokens_seen": 543228928, |
| "step": 530500 |
| }, |
| { |
| "epoch": 4.899382733135882, |
| "grad_norm": 2.323056697845459, |
| "learning_rate": 2.5503132467867983e-05, |
| "loss": 0.0316, |
| "num_input_tokens_seen": 543740928, |
| "step": 531000 |
| }, |
| { |
| "epoch": 4.903996087875181, |
| "grad_norm": 0.8404493927955627, |
| "learning_rate": 2.548006569417149e-05, |
| "loss": 0.0325, |
| "num_input_tokens_seen": 544252928, |
| "step": 531500 |
| }, |
| { |
| "epoch": 4.90860944261448, |
| "grad_norm": 0.7807884216308594, |
| "learning_rate": 2.5456998920474995e-05, |
| "loss": 0.034, |
| "num_input_tokens_seen": 544764928, |
| "step": 532000 |
| }, |
| { |
| "epoch": 4.91322279735378, |
| "grad_norm": 1.5149301290512085, |
| "learning_rate": 2.5433932146778495e-05, |
| "loss": 0.0329, |
| "num_input_tokens_seen": 545276928, |
| "step": 532500 |
| }, |
| { |
| "epoch": 4.917836152093079, |
| "grad_norm": 2.3330907821655273, |
| "learning_rate": 2.5410865373082e-05, |
| "loss": 0.0315, |
| "num_input_tokens_seen": 545788928, |
| "step": 533000 |
| }, |
| { |
| "epoch": 4.922449506832378, |
| "grad_norm": 0.9304101467132568, |
| "learning_rate": 2.5387798599385503e-05, |
| "loss": 0.0316, |
| "num_input_tokens_seen": 546300928, |
| "step": 533500 |
| }, |
| { |
| "epoch": 4.927062861571677, |
| "grad_norm": 1.3839999437332153, |
| "learning_rate": 2.5364731825689004e-05, |
| "loss": 0.0339, |
| "num_input_tokens_seen": 546812928, |
| "step": 534000 |
| }, |
| { |
| "epoch": 4.931676216310977, |
| "grad_norm": 1.3032892942428589, |
| "learning_rate": 2.5341665051992508e-05, |
| "loss": 0.0377, |
| "num_input_tokens_seen": 547324928, |
| "step": 534500 |
| }, |
| { |
| "epoch": 4.936289571050276, |
| "grad_norm": 0.5184182524681091, |
| "learning_rate": 2.5318598278296012e-05, |
| "loss": 0.0327, |
| "num_input_tokens_seen": 547836928, |
| "step": 535000 |
| }, |
| { |
| "epoch": 4.940902925789576, |
| "grad_norm": 4.176392078399658, |
| "learning_rate": 2.529553150459952e-05, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 548348928, |
| "step": 535500 |
| }, |
| { |
| "epoch": 4.945516280528875, |
| "grad_norm": 1.8942577838897705, |
| "learning_rate": 2.5272464730903016e-05, |
| "loss": 0.0312, |
| "num_input_tokens_seen": 548860928, |
| "step": 536000 |
| }, |
| { |
| "epoch": 4.9501296352681745, |
| "grad_norm": 0.4011167585849762, |
| "learning_rate": 2.524939795720652e-05, |
| "loss": 0.0297, |
| "num_input_tokens_seen": 549372928, |
| "step": 536500 |
| }, |
| { |
| "epoch": 4.954742990007474, |
| "grad_norm": 1.2499672174453735, |
| "learning_rate": 2.5226331183510028e-05, |
| "loss": 0.0351, |
| "num_input_tokens_seen": 549884928, |
| "step": 537000 |
| }, |
| { |
| "epoch": 4.959356344746773, |
| "grad_norm": 1.7503982782363892, |
| "learning_rate": 2.520326440981353e-05, |
| "loss": 0.0346, |
| "num_input_tokens_seen": 550396928, |
| "step": 537500 |
| }, |
| { |
| "epoch": 4.963969699486072, |
| "grad_norm": 0.9771599173545837, |
| "learning_rate": 2.5180197636117032e-05, |
| "loss": 0.0344, |
| "num_input_tokens_seen": 550908928, |
| "step": 538000 |
| }, |
| { |
| "epoch": 4.9685830542253715, |
| "grad_norm": 1.7374619245529175, |
| "learning_rate": 2.5157130862420536e-05, |
| "loss": 0.0328, |
| "num_input_tokens_seen": 551420928, |
| "step": 538500 |
| }, |
| { |
| "epoch": 4.973196408964671, |
| "grad_norm": 2.459627866744995, |
| "learning_rate": 2.513406408872404e-05, |
| "loss": 0.0304, |
| "num_input_tokens_seen": 551932928, |
| "step": 539000 |
| }, |
| { |
| "epoch": 4.97780976370397, |
| "grad_norm": 1.0150238275527954, |
| "learning_rate": 2.511099731502754e-05, |
| "loss": 0.0341, |
| "num_input_tokens_seen": 552444928, |
| "step": 539500 |
| }, |
| { |
| "epoch": 4.982423118443269, |
| "grad_norm": 0.5386485457420349, |
| "learning_rate": 2.5087930541331045e-05, |
| "loss": 0.0323, |
| "num_input_tokens_seen": 552956928, |
| "step": 540000 |
| }, |
| { |
| "epoch": 4.9870364731825685, |
| "grad_norm": 2.0339949131011963, |
| "learning_rate": 2.506486376763455e-05, |
| "loss": 0.0308, |
| "num_input_tokens_seen": 553468928, |
| "step": 540500 |
| }, |
| { |
| "epoch": 4.991649827921869, |
| "grad_norm": 0.7838632464408875, |
| "learning_rate": 2.5041796993938056e-05, |
| "loss": 0.0335, |
| "num_input_tokens_seen": 553980928, |
| "step": 541000 |
| }, |
| { |
| "epoch": 4.996263182661168, |
| "grad_norm": 1.2253855466842651, |
| "learning_rate": 2.5018730220241553e-05, |
| "loss": 0.0325, |
| "num_input_tokens_seen": 554492928, |
| "step": 541500 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_combined_score": 0.0704431934497777, |
| "eval_loss": 0.07044319063425064, |
| "eval_mse": 0.07044319626530475, |
| "eval_runtime": 45.8855, |
| "eval_samples_per_second": 2099.529, |
| "eval_steps_per_second": 262.457, |
| "num_input_tokens_seen": 554906880, |
| "step": 541905 |
| }, |
| { |
| "epoch": 5.000876537400467, |
| "grad_norm": 1.9685852527618408, |
| "learning_rate": 2.499566344654506e-05, |
| "loss": 0.0352, |
| "num_input_tokens_seen": 555004160, |
| "step": 542000 |
| }, |
| { |
| "epoch": 5.005489892139766, |
| "grad_norm": 1.419827938079834, |
| "learning_rate": 2.4972596672848565e-05, |
| "loss": 0.0302, |
| "num_input_tokens_seen": 555516160, |
| "step": 542500 |
| }, |
| { |
| "epoch": 5.010103246879066, |
| "grad_norm": 3.999183177947998, |
| "learning_rate": 2.4949529899152065e-05, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 556028160, |
| "step": 543000 |
| }, |
| { |
| "epoch": 5.014716601618365, |
| "grad_norm": 1.758694052696228, |
| "learning_rate": 2.4926463125455573e-05, |
| "loss": 0.0261, |
| "num_input_tokens_seen": 556540160, |
| "step": 543500 |
| }, |
| { |
| "epoch": 5.019329956357664, |
| "grad_norm": 1.1982614994049072, |
| "learning_rate": 2.4903396351759073e-05, |
| "loss": 0.0245, |
| "num_input_tokens_seen": 557052160, |
| "step": 544000 |
| }, |
| { |
| "epoch": 5.023943311096963, |
| "grad_norm": 0.8155698180198669, |
| "learning_rate": 2.4880329578062577e-05, |
| "loss": 0.0252, |
| "num_input_tokens_seen": 557564160, |
| "step": 544500 |
| }, |
| { |
| "epoch": 5.028556665836263, |
| "grad_norm": 0.5454326272010803, |
| "learning_rate": 2.485726280436608e-05, |
| "loss": 0.0243, |
| "num_input_tokens_seen": 558076160, |
| "step": 545000 |
| }, |
| { |
| "epoch": 5.033170020575562, |
| "grad_norm": 0.35681942105293274, |
| "learning_rate": 2.4834196030669585e-05, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 558588160, |
| "step": 545500 |
| }, |
| { |
| "epoch": 5.037783375314861, |
| "grad_norm": 1.3723911046981812, |
| "learning_rate": 2.4811129256973086e-05, |
| "loss": 0.0254, |
| "num_input_tokens_seen": 559100160, |
| "step": 546000 |
| }, |
| { |
| "epoch": 5.042396730054161, |
| "grad_norm": 2.3160240650177, |
| "learning_rate": 2.478806248327659e-05, |
| "loss": 0.0238, |
| "num_input_tokens_seen": 559612160, |
| "step": 546500 |
| }, |
| { |
| "epoch": 5.0470100847934605, |
| "grad_norm": 0.447410523891449, |
| "learning_rate": 2.4764995709580094e-05, |
| "loss": 0.0245, |
| "num_input_tokens_seen": 560124160, |
| "step": 547000 |
| }, |
| { |
| "epoch": 5.05162343953276, |
| "grad_norm": 1.798653483390808, |
| "learning_rate": 2.4741928935883598e-05, |
| "loss": 0.0264, |
| "num_input_tokens_seen": 560636160, |
| "step": 547500 |
| }, |
| { |
| "epoch": 5.056236794272059, |
| "grad_norm": 0.5568801164627075, |
| "learning_rate": 2.47188621621871e-05, |
| "loss": 0.0262, |
| "num_input_tokens_seen": 561148160, |
| "step": 548000 |
| }, |
| { |
| "epoch": 5.060850149011358, |
| "grad_norm": 0.5296237468719482, |
| "learning_rate": 2.4695795388490602e-05, |
| "loss": 0.0257, |
| "num_input_tokens_seen": 561660160, |
| "step": 548500 |
| }, |
| { |
| "epoch": 5.0654635037506575, |
| "grad_norm": 1.8144594430923462, |
| "learning_rate": 2.467272861479411e-05, |
| "loss": 0.0244, |
| "num_input_tokens_seen": 562172160, |
| "step": 549000 |
| }, |
| { |
| "epoch": 5.070076858489957, |
| "grad_norm": 1.125553846359253, |
| "learning_rate": 2.464966184109761e-05, |
| "loss": 0.0278, |
| "num_input_tokens_seen": 562684160, |
| "step": 549500 |
| }, |
| { |
| "epoch": 5.074690213229256, |
| "grad_norm": 1.2279289960861206, |
| "learning_rate": 2.4626595067401114e-05, |
| "loss": 0.0254, |
| "num_input_tokens_seen": 563196160, |
| "step": 550000 |
| }, |
| { |
| "epoch": 5.079303567968555, |
| "grad_norm": 1.1253972053527832, |
| "learning_rate": 2.4603528293704618e-05, |
| "loss": 0.0273, |
| "num_input_tokens_seen": 563708160, |
| "step": 550500 |
| }, |
| { |
| "epoch": 5.0839169227078544, |
| "grad_norm": 1.958179235458374, |
| "learning_rate": 2.4580461520008122e-05, |
| "loss": 0.0227, |
| "num_input_tokens_seen": 564220160, |
| "step": 551000 |
| }, |
| { |
| "epoch": 5.088530277447154, |
| "grad_norm": 1.6592975854873657, |
| "learning_rate": 2.4557394746311622e-05, |
| "loss": 0.0265, |
| "num_input_tokens_seen": 564732160, |
| "step": 551500 |
| }, |
| { |
| "epoch": 5.093143632186453, |
| "grad_norm": 0.9499948024749756, |
| "learning_rate": 2.453432797261513e-05, |
| "loss": 0.0257, |
| "num_input_tokens_seen": 565244160, |
| "step": 552000 |
| }, |
| { |
| "epoch": 5.097756986925753, |
| "grad_norm": 0.7857697606086731, |
| "learning_rate": 2.451126119891863e-05, |
| "loss": 0.0256, |
| "num_input_tokens_seen": 565756160, |
| "step": 552500 |
| }, |
| { |
| "epoch": 5.102370341665052, |
| "grad_norm": 1.4605727195739746, |
| "learning_rate": 2.4488194425222134e-05, |
| "loss": 0.0241, |
| "num_input_tokens_seen": 566268160, |
| "step": 553000 |
| }, |
| { |
| "epoch": 5.106983696404352, |
| "grad_norm": 1.2469509840011597, |
| "learning_rate": 2.446512765152564e-05, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 566780160, |
| "step": 553500 |
| }, |
| { |
| "epoch": 5.111597051143651, |
| "grad_norm": 1.826318383216858, |
| "learning_rate": 2.444206087782914e-05, |
| "loss": 0.0288, |
| "num_input_tokens_seen": 567292160, |
| "step": 554000 |
| }, |
| { |
| "epoch": 5.11621040588295, |
| "grad_norm": 4.358790397644043, |
| "learning_rate": 2.4418994104132646e-05, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 567804160, |
| "step": 554500 |
| }, |
| { |
| "epoch": 5.120823760622249, |
| "grad_norm": 1.07144033908844, |
| "learning_rate": 2.4395927330436147e-05, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 568316160, |
| "step": 555000 |
| }, |
| { |
| "epoch": 5.125437115361549, |
| "grad_norm": 1.7916905879974365, |
| "learning_rate": 2.437286055673965e-05, |
| "loss": 0.0268, |
| "num_input_tokens_seen": 568828160, |
| "step": 555500 |
| }, |
| { |
| "epoch": 5.130050470100848, |
| "grad_norm": 0.9158410429954529, |
| "learning_rate": 2.4349793783043155e-05, |
| "loss": 0.0263, |
| "num_input_tokens_seen": 569340160, |
| "step": 556000 |
| }, |
| { |
| "epoch": 5.134663824840147, |
| "grad_norm": 0.7724267244338989, |
| "learning_rate": 2.432672700934666e-05, |
| "loss": 0.0244, |
| "num_input_tokens_seen": 569852160, |
| "step": 556500 |
| }, |
| { |
| "epoch": 5.139277179579446, |
| "grad_norm": 0.48507311940193176, |
| "learning_rate": 2.430366023565016e-05, |
| "loss": 0.0274, |
| "num_input_tokens_seen": 570364160, |
| "step": 557000 |
| }, |
| { |
| "epoch": 5.1438905343187455, |
| "grad_norm": 0.6313498616218567, |
| "learning_rate": 2.4280593461953667e-05, |
| "loss": 0.0239, |
| "num_input_tokens_seen": 570876160, |
| "step": 557500 |
| }, |
| { |
| "epoch": 5.148503889058045, |
| "grad_norm": 0.987579345703125, |
| "learning_rate": 2.4257526688257167e-05, |
| "loss": 0.0255, |
| "num_input_tokens_seen": 571388160, |
| "step": 558000 |
| }, |
| { |
| "epoch": 5.153117243797345, |
| "grad_norm": 1.7795839309692383, |
| "learning_rate": 2.423445991456067e-05, |
| "loss": 0.0245, |
| "num_input_tokens_seen": 571900160, |
| "step": 558500 |
| }, |
| { |
| "epoch": 5.157730598536644, |
| "grad_norm": 1.233028531074524, |
| "learning_rate": 2.4211393140864175e-05, |
| "loss": 0.0272, |
| "num_input_tokens_seen": 572412160, |
| "step": 559000 |
| }, |
| { |
| "epoch": 5.162343953275943, |
| "grad_norm": 0.9197332262992859, |
| "learning_rate": 2.4188326367167676e-05, |
| "loss": 0.0243, |
| "num_input_tokens_seen": 572924160, |
| "step": 559500 |
| }, |
| { |
| "epoch": 5.166957308015243, |
| "grad_norm": 5.717777252197266, |
| "learning_rate": 2.4165259593471183e-05, |
| "loss": 0.024, |
| "num_input_tokens_seen": 573436160, |
| "step": 560000 |
| }, |
| { |
| "epoch": 5.171570662754542, |
| "grad_norm": 0.8062294721603394, |
| "learning_rate": 2.4142192819774684e-05, |
| "loss": 0.025, |
| "num_input_tokens_seen": 573948160, |
| "step": 560500 |
| }, |
| { |
| "epoch": 5.176184017493841, |
| "grad_norm": 1.5993818044662476, |
| "learning_rate": 2.4119126046078188e-05, |
| "loss": 0.0276, |
| "num_input_tokens_seen": 574460160, |
| "step": 561000 |
| }, |
| { |
| "epoch": 5.18079737223314, |
| "grad_norm": 1.086608648300171, |
| "learning_rate": 2.4096059272381692e-05, |
| "loss": 0.0237, |
| "num_input_tokens_seen": 574972160, |
| "step": 561500 |
| }, |
| { |
| "epoch": 5.18541072697244, |
| "grad_norm": 0.5633468627929688, |
| "learning_rate": 2.4072992498685196e-05, |
| "loss": 0.0267, |
| "num_input_tokens_seen": 575484160, |
| "step": 562000 |
| }, |
| { |
| "epoch": 5.190024081711739, |
| "grad_norm": 0.9681257605552673, |
| "learning_rate": 2.4049925724988696e-05, |
| "loss": 0.0247, |
| "num_input_tokens_seen": 575996160, |
| "step": 562500 |
| }, |
| { |
| "epoch": 5.194637436451038, |
| "grad_norm": 0.5693821907043457, |
| "learning_rate": 2.4026858951292204e-05, |
| "loss": 0.0262, |
| "num_input_tokens_seen": 576508160, |
| "step": 563000 |
| }, |
| { |
| "epoch": 5.199250791190337, |
| "grad_norm": 0.5459065437316895, |
| "learning_rate": 2.4003792177595704e-05, |
| "loss": 0.0246, |
| "num_input_tokens_seen": 577020160, |
| "step": 563500 |
| }, |
| { |
| "epoch": 5.2038641459296375, |
| "grad_norm": 0.8124216198921204, |
| "learning_rate": 2.3980725403899208e-05, |
| "loss": 0.0261, |
| "num_input_tokens_seen": 577532160, |
| "step": 564000 |
| }, |
| { |
| "epoch": 5.208477500668937, |
| "grad_norm": 2.0479400157928467, |
| "learning_rate": 2.3957658630202712e-05, |
| "loss": 0.0263, |
| "num_input_tokens_seen": 578044160, |
| "step": 564500 |
| }, |
| { |
| "epoch": 5.213090855408236, |
| "grad_norm": 0.4062500596046448, |
| "learning_rate": 2.3934591856506216e-05, |
| "loss": 0.0293, |
| "num_input_tokens_seen": 578556160, |
| "step": 565000 |
| }, |
| { |
| "epoch": 5.217704210147535, |
| "grad_norm": 0.6792827844619751, |
| "learning_rate": 2.391152508280972e-05, |
| "loss": 0.0243, |
| "num_input_tokens_seen": 579068160, |
| "step": 565500 |
| }, |
| { |
| "epoch": 5.2223175648868345, |
| "grad_norm": 1.978621482849121, |
| "learning_rate": 2.388845830911322e-05, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 579580160, |
| "step": 566000 |
| }, |
| { |
| "epoch": 5.226930919626134, |
| "grad_norm": 1.0961169004440308, |
| "learning_rate": 2.3865391535416725e-05, |
| "loss": 0.0264, |
| "num_input_tokens_seen": 580092160, |
| "step": 566500 |
| }, |
| { |
| "epoch": 5.231544274365433, |
| "grad_norm": 2.3269541263580322, |
| "learning_rate": 2.384232476172023e-05, |
| "loss": 0.0246, |
| "num_input_tokens_seen": 580604160, |
| "step": 567000 |
| }, |
| { |
| "epoch": 5.236157629104732, |
| "grad_norm": 0.545312762260437, |
| "learning_rate": 2.3819257988023733e-05, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 581116160, |
| "step": 567500 |
| }, |
| { |
| "epoch": 5.2407709838440315, |
| "grad_norm": 0.7577276825904846, |
| "learning_rate": 2.3796191214327233e-05, |
| "loss": 0.026, |
| "num_input_tokens_seen": 581628160, |
| "step": 568000 |
| }, |
| { |
| "epoch": 5.245384338583331, |
| "grad_norm": 0.5405977964401245, |
| "learning_rate": 2.377312444063074e-05, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 582140160, |
| "step": 568500 |
| }, |
| { |
| "epoch": 5.24999769332263, |
| "grad_norm": 0.5924959182739258, |
| "learning_rate": 2.375005766693424e-05, |
| "loss": 0.0264, |
| "num_input_tokens_seen": 582652160, |
| "step": 569000 |
| }, |
| { |
| "epoch": 5.25461104806193, |
| "grad_norm": 1.2683016061782837, |
| "learning_rate": 2.3726990893237745e-05, |
| "loss": 0.0262, |
| "num_input_tokens_seen": 583164160, |
| "step": 569500 |
| }, |
| { |
| "epoch": 5.259224402801229, |
| "grad_norm": 1.1642249822616577, |
| "learning_rate": 2.370392411954125e-05, |
| "loss": 0.0263, |
| "num_input_tokens_seen": 583676160, |
| "step": 570000 |
| }, |
| { |
| "epoch": 5.263837757540529, |
| "grad_norm": 1.1712781190872192, |
| "learning_rate": 2.3680857345844753e-05, |
| "loss": 0.0254, |
| "num_input_tokens_seen": 584188160, |
| "step": 570500 |
| }, |
| { |
| "epoch": 5.268451112279828, |
| "grad_norm": 1.0108134746551514, |
| "learning_rate": 2.3657790572148257e-05, |
| "loss": 0.0256, |
| "num_input_tokens_seen": 584700160, |
| "step": 571000 |
| }, |
| { |
| "epoch": 5.273064467019127, |
| "grad_norm": 2.7338948249816895, |
| "learning_rate": 2.363472379845176e-05, |
| "loss": 0.0275, |
| "num_input_tokens_seen": 585212160, |
| "step": 571500 |
| }, |
| { |
| "epoch": 5.277677821758426, |
| "grad_norm": 0.6406319737434387, |
| "learning_rate": 2.361165702475526e-05, |
| "loss": 0.0238, |
| "num_input_tokens_seen": 585724160, |
| "step": 572000 |
| }, |
| { |
| "epoch": 5.282291176497726, |
| "grad_norm": 1.551131010055542, |
| "learning_rate": 2.3588590251058766e-05, |
| "loss": 0.0261, |
| "num_input_tokens_seen": 586236160, |
| "step": 572500 |
| }, |
| { |
| "epoch": 5.286904531237025, |
| "grad_norm": 0.41061103343963623, |
| "learning_rate": 2.356552347736227e-05, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 586748160, |
| "step": 573000 |
| }, |
| { |
| "epoch": 5.291517885976324, |
| "grad_norm": 0.7769986987113953, |
| "learning_rate": 2.354245670366577e-05, |
| "loss": 0.0251, |
| "num_input_tokens_seen": 587260160, |
| "step": 573500 |
| }, |
| { |
| "epoch": 5.296131240715623, |
| "grad_norm": 1.0587828159332275, |
| "learning_rate": 2.3519389929969277e-05, |
| "loss": 0.024, |
| "num_input_tokens_seen": 587772160, |
| "step": 574000 |
| }, |
| { |
| "epoch": 5.300744595454923, |
| "grad_norm": 0.7457670569419861, |
| "learning_rate": 2.3496323156272778e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 588284160, |
| "step": 574500 |
| }, |
| { |
| "epoch": 5.305357950194222, |
| "grad_norm": 1.7087829113006592, |
| "learning_rate": 2.3473256382576282e-05, |
| "loss": 0.0265, |
| "num_input_tokens_seen": 588796160, |
| "step": 575000 |
| }, |
| { |
| "epoch": 5.309971304933521, |
| "grad_norm": 1.6121881008148193, |
| "learning_rate": 2.3450189608879786e-05, |
| "loss": 0.0236, |
| "num_input_tokens_seen": 589308160, |
| "step": 575500 |
| }, |
| { |
| "epoch": 5.314584659672821, |
| "grad_norm": 1.585402011871338, |
| "learning_rate": 2.342712283518329e-05, |
| "loss": 0.0253, |
| "num_input_tokens_seen": 589820160, |
| "step": 576000 |
| }, |
| { |
| "epoch": 5.3191980144121205, |
| "grad_norm": 2.160334348678589, |
| "learning_rate": 2.3404056061486794e-05, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 590332160, |
| "step": 576500 |
| }, |
| { |
| "epoch": 5.32381136915142, |
| "grad_norm": 0.304321825504303, |
| "learning_rate": 2.3380989287790298e-05, |
| "loss": 0.0268, |
| "num_input_tokens_seen": 590844160, |
| "step": 577000 |
| }, |
| { |
| "epoch": 5.328424723890719, |
| "grad_norm": 0.9023957848548889, |
| "learning_rate": 2.33579225140938e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 591356160, |
| "step": 577500 |
| }, |
| { |
| "epoch": 5.333038078630018, |
| "grad_norm": 0.5087705254554749, |
| "learning_rate": 2.3334855740397306e-05, |
| "loss": 0.026, |
| "num_input_tokens_seen": 591868160, |
| "step": 578000 |
| }, |
| { |
| "epoch": 5.3376514333693175, |
| "grad_norm": 1.3647748231887817, |
| "learning_rate": 2.3311788966700806e-05, |
| "loss": 0.0268, |
| "num_input_tokens_seen": 592380160, |
| "step": 578500 |
| }, |
| { |
| "epoch": 5.342264788108617, |
| "grad_norm": 1.011982798576355, |
| "learning_rate": 2.328872219300431e-05, |
| "loss": 0.0267, |
| "num_input_tokens_seen": 592892160, |
| "step": 579000 |
| }, |
| { |
| "epoch": 5.346878142847916, |
| "grad_norm": 1.695412516593933, |
| "learning_rate": 2.3265655419307814e-05, |
| "loss": 0.0244, |
| "num_input_tokens_seen": 593404160, |
| "step": 579500 |
| }, |
| { |
| "epoch": 5.351491497587215, |
| "grad_norm": 2.6255669593811035, |
| "learning_rate": 2.3242588645611315e-05, |
| "loss": 0.0279, |
| "num_input_tokens_seen": 593916160, |
| "step": 580000 |
| }, |
| { |
| "epoch": 5.3561048523265145, |
| "grad_norm": 1.49470055103302, |
| "learning_rate": 2.321952187191482e-05, |
| "loss": 0.0255, |
| "num_input_tokens_seen": 594428160, |
| "step": 580500 |
| }, |
| { |
| "epoch": 5.360718207065814, |
| "grad_norm": 5.862457275390625, |
| "learning_rate": 2.3196455098218323e-05, |
| "loss": 0.0272, |
| "num_input_tokens_seen": 594940160, |
| "step": 581000 |
| }, |
| { |
| "epoch": 5.365331561805114, |
| "grad_norm": 1.1416678428649902, |
| "learning_rate": 2.3173388324521827e-05, |
| "loss": 0.0257, |
| "num_input_tokens_seen": 595452160, |
| "step": 581500 |
| }, |
| { |
| "epoch": 5.369944916544413, |
| "grad_norm": 1.0137473344802856, |
| "learning_rate": 2.315032155082533e-05, |
| "loss": 0.0278, |
| "num_input_tokens_seen": 595964160, |
| "step": 582000 |
| }, |
| { |
| "epoch": 5.374558271283712, |
| "grad_norm": 1.037350058555603, |
| "learning_rate": 2.3127254777128835e-05, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 596476160, |
| "step": 582500 |
| }, |
| { |
| "epoch": 5.379171626023012, |
| "grad_norm": 0.5939755439758301, |
| "learning_rate": 2.3104188003432335e-05, |
| "loss": 0.0253, |
| "num_input_tokens_seen": 596988160, |
| "step": 583000 |
| }, |
| { |
| "epoch": 5.383784980762311, |
| "grad_norm": 0.8637872934341431, |
| "learning_rate": 2.3081121229735843e-05, |
| "loss": 0.0294, |
| "num_input_tokens_seen": 597500160, |
| "step": 583500 |
| }, |
| { |
| "epoch": 5.38839833550161, |
| "grad_norm": 0.6153502464294434, |
| "learning_rate": 2.3058054456039343e-05, |
| "loss": 0.0252, |
| "num_input_tokens_seen": 598012160, |
| "step": 584000 |
| }, |
| { |
| "epoch": 5.393011690240909, |
| "grad_norm": 0.7826283574104309, |
| "learning_rate": 2.3034987682342847e-05, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 598524160, |
| "step": 584500 |
| }, |
| { |
| "epoch": 5.397625044980209, |
| "grad_norm": 0.8609397411346436, |
| "learning_rate": 2.301192090864635e-05, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 599036160, |
| "step": 585000 |
| }, |
| { |
| "epoch": 5.402238399719508, |
| "grad_norm": 1.031718134880066, |
| "learning_rate": 2.2988854134949852e-05, |
| "loss": 0.0264, |
| "num_input_tokens_seen": 599548160, |
| "step": 585500 |
| }, |
| { |
| "epoch": 5.406851754458807, |
| "grad_norm": 4.244394779205322, |
| "learning_rate": 2.296578736125336e-05, |
| "loss": 0.0284, |
| "num_input_tokens_seen": 600060160, |
| "step": 586000 |
| }, |
| { |
| "epoch": 5.411465109198106, |
| "grad_norm": 0.6755638122558594, |
| "learning_rate": 2.294272058755686e-05, |
| "loss": 0.0256, |
| "num_input_tokens_seen": 600572160, |
| "step": 586500 |
| }, |
| { |
| "epoch": 5.416078463937406, |
| "grad_norm": 0.5303651690483093, |
| "learning_rate": 2.2919653813860364e-05, |
| "loss": 0.0272, |
| "num_input_tokens_seen": 601084160, |
| "step": 587000 |
| }, |
| { |
| "epoch": 5.420691818676706, |
| "grad_norm": 0.8649631142616272, |
| "learning_rate": 2.2896587040163868e-05, |
| "loss": 0.0245, |
| "num_input_tokens_seen": 601596160, |
| "step": 587500 |
| }, |
| { |
| "epoch": 5.425305173416005, |
| "grad_norm": 0.5191958546638489, |
| "learning_rate": 2.287352026646737e-05, |
| "loss": 0.0271, |
| "num_input_tokens_seen": 602108160, |
| "step": 588000 |
| }, |
| { |
| "epoch": 5.429918528155304, |
| "grad_norm": 1.2616572380065918, |
| "learning_rate": 2.2850453492770872e-05, |
| "loss": 0.0271, |
| "num_input_tokens_seen": 602620160, |
| "step": 588500 |
| }, |
| { |
| "epoch": 5.434531882894603, |
| "grad_norm": 0.8619266152381897, |
| "learning_rate": 2.282738671907438e-05, |
| "loss": 0.0262, |
| "num_input_tokens_seen": 603132160, |
| "step": 589000 |
| }, |
| { |
| "epoch": 5.439145237633903, |
| "grad_norm": 0.7039788961410522, |
| "learning_rate": 2.280431994537788e-05, |
| "loss": 0.0247, |
| "num_input_tokens_seen": 603644160, |
| "step": 589500 |
| }, |
| { |
| "epoch": 5.443758592373202, |
| "grad_norm": 2.772310495376587, |
| "learning_rate": 2.2781253171681384e-05, |
| "loss": 0.0267, |
| "num_input_tokens_seen": 604156160, |
| "step": 590000 |
| }, |
| { |
| "epoch": 5.448371947112501, |
| "grad_norm": 0.5451655387878418, |
| "learning_rate": 2.2758186397984888e-05, |
| "loss": 0.0261, |
| "num_input_tokens_seen": 604668160, |
| "step": 590500 |
| }, |
| { |
| "epoch": 5.4529853018518, |
| "grad_norm": 0.8995614647865295, |
| "learning_rate": 2.2735119624288392e-05, |
| "loss": 0.024, |
| "num_input_tokens_seen": 605180160, |
| "step": 591000 |
| }, |
| { |
| "epoch": 5.4575986565911, |
| "grad_norm": 1.981187105178833, |
| "learning_rate": 2.2712052850591896e-05, |
| "loss": 0.0263, |
| "num_input_tokens_seen": 605692160, |
| "step": 591500 |
| }, |
| { |
| "epoch": 5.462212011330399, |
| "grad_norm": 0.7811481952667236, |
| "learning_rate": 2.2688986076895397e-05, |
| "loss": 0.0286, |
| "num_input_tokens_seen": 606204160, |
| "step": 592000 |
| }, |
| { |
| "epoch": 5.466825366069698, |
| "grad_norm": 2.7757558822631836, |
| "learning_rate": 2.26659193031989e-05, |
| "loss": 0.0253, |
| "num_input_tokens_seen": 606716160, |
| "step": 592500 |
| }, |
| { |
| "epoch": 5.471438720808998, |
| "grad_norm": 1.9782260656356812, |
| "learning_rate": 2.2642852529502405e-05, |
| "loss": 0.0277, |
| "num_input_tokens_seen": 607228160, |
| "step": 593000 |
| }, |
| { |
| "epoch": 5.4760520755482975, |
| "grad_norm": 2.8401777744293213, |
| "learning_rate": 2.261978575580591e-05, |
| "loss": 0.0255, |
| "num_input_tokens_seen": 607740160, |
| "step": 593500 |
| }, |
| { |
| "epoch": 5.480665430287597, |
| "grad_norm": 0.5879292488098145, |
| "learning_rate": 2.259671898210941e-05, |
| "loss": 0.027, |
| "num_input_tokens_seen": 608252160, |
| "step": 594000 |
| }, |
| { |
| "epoch": 5.485278785026896, |
| "grad_norm": 1.1103825569152832, |
| "learning_rate": 2.2573652208412917e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 608764160, |
| "step": 594500 |
| }, |
| { |
| "epoch": 5.489892139766195, |
| "grad_norm": 1.002668857574463, |
| "learning_rate": 2.2550585434716417e-05, |
| "loss": 0.0276, |
| "num_input_tokens_seen": 609276160, |
| "step": 595000 |
| }, |
| { |
| "epoch": 5.4945054945054945, |
| "grad_norm": 0.5841794013977051, |
| "learning_rate": 2.252751866101992e-05, |
| "loss": 0.0272, |
| "num_input_tokens_seen": 609788160, |
| "step": 595500 |
| }, |
| { |
| "epoch": 5.499118849244794, |
| "grad_norm": 0.6137141585350037, |
| "learning_rate": 2.2504451887323425e-05, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 610300160, |
| "step": 596000 |
| }, |
| { |
| "epoch": 5.503732203984093, |
| "grad_norm": 0.6018849015235901, |
| "learning_rate": 2.248138511362693e-05, |
| "loss": 0.0279, |
| "num_input_tokens_seen": 610812160, |
| "step": 596500 |
| }, |
| { |
| "epoch": 5.508345558723392, |
| "grad_norm": 1.4851562976837158, |
| "learning_rate": 2.2458318339930433e-05, |
| "loss": 0.0268, |
| "num_input_tokens_seen": 611324160, |
| "step": 597000 |
| }, |
| { |
| "epoch": 5.5129589134626915, |
| "grad_norm": 1.9454591274261475, |
| "learning_rate": 2.2435251566233937e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 611836160, |
| "step": 597500 |
| }, |
| { |
| "epoch": 5.517572268201991, |
| "grad_norm": 1.9615495204925537, |
| "learning_rate": 2.2412184792537438e-05, |
| "loss": 0.0257, |
| "num_input_tokens_seen": 612348160, |
| "step": 598000 |
| }, |
| { |
| "epoch": 5.52218562294129, |
| "grad_norm": 1.1803622245788574, |
| "learning_rate": 2.238911801884094e-05, |
| "loss": 0.0256, |
| "num_input_tokens_seen": 612860160, |
| "step": 598500 |
| }, |
| { |
| "epoch": 5.52679897768059, |
| "grad_norm": 0.7780105471611023, |
| "learning_rate": 2.2366051245144445e-05, |
| "loss": 0.0264, |
| "num_input_tokens_seen": 613372160, |
| "step": 599000 |
| }, |
| { |
| "epoch": 5.531412332419889, |
| "grad_norm": 0.5582423806190491, |
| "learning_rate": 2.2342984471447946e-05, |
| "loss": 0.0279, |
| "num_input_tokens_seen": 613884160, |
| "step": 599500 |
| }, |
| { |
| "epoch": 5.536025687159189, |
| "grad_norm": 1.4547449350357056, |
| "learning_rate": 2.2319917697751453e-05, |
| "loss": 0.0273, |
| "num_input_tokens_seen": 614396160, |
| "step": 600000 |
| }, |
| { |
| "epoch": 5.540639041898488, |
| "grad_norm": 1.0105394124984741, |
| "learning_rate": 2.2296850924054954e-05, |
| "loss": 0.0251, |
| "num_input_tokens_seen": 614908160, |
| "step": 600500 |
| }, |
| { |
| "epoch": 5.545252396637787, |
| "grad_norm": 0.7775139212608337, |
| "learning_rate": 2.2273784150358458e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 615420160, |
| "step": 601000 |
| }, |
| { |
| "epoch": 5.549865751377086, |
| "grad_norm": 0.40573227405548096, |
| "learning_rate": 2.2250717376661962e-05, |
| "loss": 0.0268, |
| "num_input_tokens_seen": 615932160, |
| "step": 601500 |
| }, |
| { |
| "epoch": 5.554479106116386, |
| "grad_norm": 1.130553126335144, |
| "learning_rate": 2.2227650602965466e-05, |
| "loss": 0.0255, |
| "num_input_tokens_seen": 616444160, |
| "step": 602000 |
| }, |
| { |
| "epoch": 5.559092460855685, |
| "grad_norm": 1.0450289249420166, |
| "learning_rate": 2.220458382926897e-05, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 616956160, |
| "step": 602500 |
| }, |
| { |
| "epoch": 5.563705815594984, |
| "grad_norm": 0.7919219136238098, |
| "learning_rate": 2.2181517055572474e-05, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 617468160, |
| "step": 603000 |
| }, |
| { |
| "epoch": 5.568319170334283, |
| "grad_norm": 0.7787536382675171, |
| "learning_rate": 2.2158450281875974e-05, |
| "loss": 0.0305, |
| "num_input_tokens_seen": 617980160, |
| "step": 603500 |
| }, |
| { |
| "epoch": 5.572932525073583, |
| "grad_norm": 1.2866960763931274, |
| "learning_rate": 2.2135383508179482e-05, |
| "loss": 0.028, |
| "num_input_tokens_seen": 618492160, |
| "step": 604000 |
| }, |
| { |
| "epoch": 5.577545879812883, |
| "grad_norm": 1.9128954410552979, |
| "learning_rate": 2.2112316734482982e-05, |
| "loss": 0.0276, |
| "num_input_tokens_seen": 619004160, |
| "step": 604500 |
| }, |
| { |
| "epoch": 5.582159234552182, |
| "grad_norm": 1.13468337059021, |
| "learning_rate": 2.2089249960786483e-05, |
| "loss": 0.027, |
| "num_input_tokens_seen": 619516160, |
| "step": 605000 |
| }, |
| { |
| "epoch": 5.586772589291481, |
| "grad_norm": 1.4375085830688477, |
| "learning_rate": 2.206618318708999e-05, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 620028160, |
| "step": 605500 |
| }, |
| { |
| "epoch": 5.5913859440307805, |
| "grad_norm": 0.722649872303009, |
| "learning_rate": 2.204311641339349e-05, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 620540160, |
| "step": 606000 |
| }, |
| { |
| "epoch": 5.59599929877008, |
| "grad_norm": 0.8669957518577576, |
| "learning_rate": 2.2020049639696995e-05, |
| "loss": 0.0262, |
| "num_input_tokens_seen": 621052160, |
| "step": 606500 |
| }, |
| { |
| "epoch": 5.600612653509379, |
| "grad_norm": 0.8053223490715027, |
| "learning_rate": 2.19969828660005e-05, |
| "loss": 0.0302, |
| "num_input_tokens_seen": 621564160, |
| "step": 607000 |
| }, |
| { |
| "epoch": 5.605226008248678, |
| "grad_norm": 1.0647988319396973, |
| "learning_rate": 2.1973916092304003e-05, |
| "loss": 0.0263, |
| "num_input_tokens_seen": 622076160, |
| "step": 607500 |
| }, |
| { |
| "epoch": 5.6098393629879775, |
| "grad_norm": 1.0449702739715576, |
| "learning_rate": 2.1950849318607507e-05, |
| "loss": 0.0292, |
| "num_input_tokens_seen": 622588160, |
| "step": 608000 |
| }, |
| { |
| "epoch": 5.614452717727277, |
| "grad_norm": 0.8551065921783447, |
| "learning_rate": 2.192778254491101e-05, |
| "loss": 0.026, |
| "num_input_tokens_seen": 623100160, |
| "step": 608500 |
| }, |
| { |
| "epoch": 5.619066072466576, |
| "grad_norm": 0.9317313432693481, |
| "learning_rate": 2.190471577121451e-05, |
| "loss": 0.0273, |
| "num_input_tokens_seen": 623612160, |
| "step": 609000 |
| }, |
| { |
| "epoch": 5.623679427205875, |
| "grad_norm": 1.1779793500900269, |
| "learning_rate": 2.188164899751802e-05, |
| "loss": 0.0267, |
| "num_input_tokens_seen": 624124160, |
| "step": 609500 |
| }, |
| { |
| "epoch": 5.628292781945175, |
| "grad_norm": 0.7221566438674927, |
| "learning_rate": 2.185858222382152e-05, |
| "loss": 0.0263, |
| "num_input_tokens_seen": 624636160, |
| "step": 610000 |
| }, |
| { |
| "epoch": 5.632906136684475, |
| "grad_norm": 1.5405559539794922, |
| "learning_rate": 2.1835515450125023e-05, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 625148160, |
| "step": 610500 |
| }, |
| { |
| "epoch": 5.637519491423774, |
| "grad_norm": 1.2586696147918701, |
| "learning_rate": 2.1812448676428527e-05, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 625660160, |
| "step": 611000 |
| }, |
| { |
| "epoch": 5.642132846163073, |
| "grad_norm": 1.4537557363510132, |
| "learning_rate": 2.1789381902732028e-05, |
| "loss": 0.0254, |
| "num_input_tokens_seen": 626172160, |
| "step": 611500 |
| }, |
| { |
| "epoch": 5.646746200902372, |
| "grad_norm": 0.7319709658622742, |
| "learning_rate": 2.1766315129035532e-05, |
| "loss": 0.0286, |
| "num_input_tokens_seen": 626684160, |
| "step": 612000 |
| }, |
| { |
| "epoch": 5.651359555641672, |
| "grad_norm": 0.6492053866386414, |
| "learning_rate": 2.1743248355339036e-05, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 627196160, |
| "step": 612500 |
| }, |
| { |
| "epoch": 5.655972910380971, |
| "grad_norm": 1.0684195756912231, |
| "learning_rate": 2.172018158164254e-05, |
| "loss": 0.0239, |
| "num_input_tokens_seen": 627708160, |
| "step": 613000 |
| }, |
| { |
| "epoch": 5.66058626512027, |
| "grad_norm": 1.018306851387024, |
| "learning_rate": 2.1697114807946044e-05, |
| "loss": 0.027, |
| "num_input_tokens_seen": 628220160, |
| "step": 613500 |
| }, |
| { |
| "epoch": 5.665199619859569, |
| "grad_norm": 0.5089601278305054, |
| "learning_rate": 2.1674048034249548e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 628732160, |
| "step": 614000 |
| }, |
| { |
| "epoch": 5.669812974598869, |
| "grad_norm": 1.606461763381958, |
| "learning_rate": 2.1650981260553048e-05, |
| "loss": 0.0267, |
| "num_input_tokens_seen": 629244160, |
| "step": 614500 |
| }, |
| { |
| "epoch": 5.674426329338168, |
| "grad_norm": 1.479805588722229, |
| "learning_rate": 2.1627914486856556e-05, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 629756160, |
| "step": 615000 |
| }, |
| { |
| "epoch": 5.679039684077467, |
| "grad_norm": 2.971240758895874, |
| "learning_rate": 2.1604847713160056e-05, |
| "loss": 0.0285, |
| "num_input_tokens_seen": 630268160, |
| "step": 615500 |
| }, |
| { |
| "epoch": 5.683653038816766, |
| "grad_norm": 0.5969455242156982, |
| "learning_rate": 2.158178093946356e-05, |
| "loss": 0.0265, |
| "num_input_tokens_seen": 630780160, |
| "step": 616000 |
| }, |
| { |
| "epoch": 5.6882663935560664, |
| "grad_norm": 0.7076913118362427, |
| "learning_rate": 2.1558714165767064e-05, |
| "loss": 0.024, |
| "num_input_tokens_seen": 631292160, |
| "step": 616500 |
| }, |
| { |
| "epoch": 5.692879748295366, |
| "grad_norm": 0.8780455589294434, |
| "learning_rate": 2.1535647392070568e-05, |
| "loss": 0.0253, |
| "num_input_tokens_seen": 631804160, |
| "step": 617000 |
| }, |
| { |
| "epoch": 5.697493103034665, |
| "grad_norm": 3.569014549255371, |
| "learning_rate": 2.151258061837407e-05, |
| "loss": 0.0252, |
| "num_input_tokens_seen": 632316160, |
| "step": 617500 |
| }, |
| { |
| "epoch": 5.702106457773964, |
| "grad_norm": 0.9523796439170837, |
| "learning_rate": 2.1489513844677573e-05, |
| "loss": 0.0275, |
| "num_input_tokens_seen": 632828160, |
| "step": 618000 |
| }, |
| { |
| "epoch": 5.706719812513263, |
| "grad_norm": 0.6151872873306274, |
| "learning_rate": 2.1466447070981077e-05, |
| "loss": 0.0272, |
| "num_input_tokens_seen": 633340160, |
| "step": 618500 |
| }, |
| { |
| "epoch": 5.711333167252563, |
| "grad_norm": 4.095676422119141, |
| "learning_rate": 2.144338029728458e-05, |
| "loss": 0.0309, |
| "num_input_tokens_seen": 633852160, |
| "step": 619000 |
| }, |
| { |
| "epoch": 5.715946521991862, |
| "grad_norm": 1.5436087846755981, |
| "learning_rate": 2.1420313523588085e-05, |
| "loss": 0.0237, |
| "num_input_tokens_seen": 634364160, |
| "step": 619500 |
| }, |
| { |
| "epoch": 5.720559876731161, |
| "grad_norm": 0.722958505153656, |
| "learning_rate": 2.1397246749891585e-05, |
| "loss": 0.024, |
| "num_input_tokens_seen": 634876160, |
| "step": 620000 |
| }, |
| { |
| "epoch": 5.72517323147046, |
| "grad_norm": 1.9889734983444214, |
| "learning_rate": 2.1374179976195092e-05, |
| "loss": 0.026, |
| "num_input_tokens_seen": 635388160, |
| "step": 620500 |
| }, |
| { |
| "epoch": 5.72978658620976, |
| "grad_norm": 1.8848015069961548, |
| "learning_rate": 2.1351113202498593e-05, |
| "loss": 0.0295, |
| "num_input_tokens_seen": 635900160, |
| "step": 621000 |
| }, |
| { |
| "epoch": 5.734399940949059, |
| "grad_norm": 1.4463508129119873, |
| "learning_rate": 2.1328046428802097e-05, |
| "loss": 0.0286, |
| "num_input_tokens_seen": 636412160, |
| "step": 621500 |
| }, |
| { |
| "epoch": 5.739013295688359, |
| "grad_norm": 2.2826876640319824, |
| "learning_rate": 2.13049796551056e-05, |
| "loss": 0.0278, |
| "num_input_tokens_seen": 636924160, |
| "step": 622000 |
| }, |
| { |
| "epoch": 5.743626650427658, |
| "grad_norm": 0.8323870897293091, |
| "learning_rate": 2.1281912881409105e-05, |
| "loss": 0.0247, |
| "num_input_tokens_seen": 637436160, |
| "step": 622500 |
| }, |
| { |
| "epoch": 5.7482400051669575, |
| "grad_norm": 1.4278696775436401, |
| "learning_rate": 2.1258846107712606e-05, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 637948160, |
| "step": 623000 |
| }, |
| { |
| "epoch": 5.752853359906257, |
| "grad_norm": 0.425340473651886, |
| "learning_rate": 2.1235779334016113e-05, |
| "loss": 0.0263, |
| "num_input_tokens_seen": 638460160, |
| "step": 623500 |
| }, |
| { |
| "epoch": 5.757466714645556, |
| "grad_norm": 0.6665620803833008, |
| "learning_rate": 2.1212712560319614e-05, |
| "loss": 0.0286, |
| "num_input_tokens_seen": 638972160, |
| "step": 624000 |
| }, |
| { |
| "epoch": 5.762080069384855, |
| "grad_norm": 1.1083565950393677, |
| "learning_rate": 2.1189645786623117e-05, |
| "loss": 0.0251, |
| "num_input_tokens_seen": 639484160, |
| "step": 624500 |
| }, |
| { |
| "epoch": 5.7666934241241545, |
| "grad_norm": 1.5361641645431519, |
| "learning_rate": 2.116657901292662e-05, |
| "loss": 0.0238, |
| "num_input_tokens_seen": 639996160, |
| "step": 625000 |
| }, |
| { |
| "epoch": 5.771306778863454, |
| "grad_norm": 1.897976040840149, |
| "learning_rate": 2.1143512239230122e-05, |
| "loss": 0.0252, |
| "num_input_tokens_seen": 640508160, |
| "step": 625500 |
| }, |
| { |
| "epoch": 5.775920133602753, |
| "grad_norm": 1.181335687637329, |
| "learning_rate": 2.112044546553363e-05, |
| "loss": 0.0274, |
| "num_input_tokens_seen": 641020160, |
| "step": 626000 |
| }, |
| { |
| "epoch": 5.780533488342052, |
| "grad_norm": 1.2350566387176514, |
| "learning_rate": 2.109737869183713e-05, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 641532160, |
| "step": 626500 |
| }, |
| { |
| "epoch": 5.7851468430813515, |
| "grad_norm": 0.9288113713264465, |
| "learning_rate": 2.1074311918140634e-05, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 642044160, |
| "step": 627000 |
| }, |
| { |
| "epoch": 5.789760197820652, |
| "grad_norm": 1.3695634603500366, |
| "learning_rate": 2.1051245144444138e-05, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 642556160, |
| "step": 627500 |
| }, |
| { |
| "epoch": 5.794373552559951, |
| "grad_norm": 1.5921497344970703, |
| "learning_rate": 2.1028178370747642e-05, |
| "loss": 0.0271, |
| "num_input_tokens_seen": 643068160, |
| "step": 628000 |
| }, |
| { |
| "epoch": 5.79898690729925, |
| "grad_norm": 0.9547250866889954, |
| "learning_rate": 2.1005111597051146e-05, |
| "loss": 0.0246, |
| "num_input_tokens_seen": 643580160, |
| "step": 628500 |
| }, |
| { |
| "epoch": 5.803600262038549, |
| "grad_norm": 0.702260434627533, |
| "learning_rate": 2.098204482335465e-05, |
| "loss": 0.0286, |
| "num_input_tokens_seen": 644092160, |
| "step": 629000 |
| }, |
| { |
| "epoch": 5.808213616777849, |
| "grad_norm": 1.7382519245147705, |
| "learning_rate": 2.095897804965815e-05, |
| "loss": 0.0247, |
| "num_input_tokens_seen": 644604160, |
| "step": 629500 |
| }, |
| { |
| "epoch": 5.812826971517148, |
| "grad_norm": 0.724609911441803, |
| "learning_rate": 2.0935911275961654e-05, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 645116160, |
| "step": 630000 |
| }, |
| { |
| "epoch": 5.817440326256447, |
| "grad_norm": 0.8976930379867554, |
| "learning_rate": 2.091284450226516e-05, |
| "loss": 0.0261, |
| "num_input_tokens_seen": 645628160, |
| "step": 630500 |
| }, |
| { |
| "epoch": 5.822053680995746, |
| "grad_norm": 2.6822431087493896, |
| "learning_rate": 2.088977772856866e-05, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 646140160, |
| "step": 631000 |
| }, |
| { |
| "epoch": 5.826667035735046, |
| "grad_norm": 0.9543342590332031, |
| "learning_rate": 2.0866710954872166e-05, |
| "loss": 0.0255, |
| "num_input_tokens_seen": 646652160, |
| "step": 631500 |
| }, |
| { |
| "epoch": 5.831280390474345, |
| "grad_norm": 1.0366599559783936, |
| "learning_rate": 2.0843644181175667e-05, |
| "loss": 0.0265, |
| "num_input_tokens_seen": 647164160, |
| "step": 632000 |
| }, |
| { |
| "epoch": 5.835893745213644, |
| "grad_norm": 2.613006830215454, |
| "learning_rate": 2.082057740747917e-05, |
| "loss": 0.0264, |
| "num_input_tokens_seen": 647676160, |
| "step": 632500 |
| }, |
| { |
| "epoch": 5.840507099952944, |
| "grad_norm": 0.2824631631374359, |
| "learning_rate": 2.0797510633782675e-05, |
| "loss": 0.0288, |
| "num_input_tokens_seen": 648188160, |
| "step": 633000 |
| }, |
| { |
| "epoch": 5.845120454692243, |
| "grad_norm": 3.399728298187256, |
| "learning_rate": 2.077444386008618e-05, |
| "loss": 0.0268, |
| "num_input_tokens_seen": 648700160, |
| "step": 633500 |
| }, |
| { |
| "epoch": 5.849733809431543, |
| "grad_norm": 0.7402966022491455, |
| "learning_rate": 2.0751377086389683e-05, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 649212160, |
| "step": 634000 |
| }, |
| { |
| "epoch": 5.854347164170842, |
| "grad_norm": 0.7553480267524719, |
| "learning_rate": 2.0728310312693187e-05, |
| "loss": 0.0277, |
| "num_input_tokens_seen": 649724160, |
| "step": 634500 |
| }, |
| { |
| "epoch": 5.858960518910141, |
| "grad_norm": 3.4398159980773926, |
| "learning_rate": 2.0705243538996687e-05, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 650236160, |
| "step": 635000 |
| }, |
| { |
| "epoch": 5.8635738736494405, |
| "grad_norm": 0.5711115598678589, |
| "learning_rate": 2.0682176765300195e-05, |
| "loss": 0.0241, |
| "num_input_tokens_seen": 650748160, |
| "step": 635500 |
| }, |
| { |
| "epoch": 5.86818722838874, |
| "grad_norm": 0.7952388525009155, |
| "learning_rate": 2.0659109991603695e-05, |
| "loss": 0.0275, |
| "num_input_tokens_seen": 651260160, |
| "step": 636000 |
| }, |
| { |
| "epoch": 5.872800583128039, |
| "grad_norm": 1.0399372577667236, |
| "learning_rate": 2.06360432179072e-05, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 651772160, |
| "step": 636500 |
| }, |
| { |
| "epoch": 5.877413937867338, |
| "grad_norm": 1.6778496503829956, |
| "learning_rate": 2.0612976444210703e-05, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 652284160, |
| "step": 637000 |
| }, |
| { |
| "epoch": 5.8820272926066375, |
| "grad_norm": 1.3442925214767456, |
| "learning_rate": 2.0589909670514204e-05, |
| "loss": 0.0271, |
| "num_input_tokens_seen": 652796160, |
| "step": 637500 |
| }, |
| { |
| "epoch": 5.886640647345937, |
| "grad_norm": 1.1822031736373901, |
| "learning_rate": 2.0566842896817708e-05, |
| "loss": 0.0256, |
| "num_input_tokens_seen": 653308160, |
| "step": 638000 |
| }, |
| { |
| "epoch": 5.891254002085236, |
| "grad_norm": 1.5322853326797485, |
| "learning_rate": 2.0543776123121212e-05, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 653820160, |
| "step": 638500 |
| }, |
| { |
| "epoch": 5.895867356824535, |
| "grad_norm": 1.6025440692901611, |
| "learning_rate": 2.0520709349424716e-05, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 654332160, |
| "step": 639000 |
| }, |
| { |
| "epoch": 5.900480711563835, |
| "grad_norm": 0.7516422867774963, |
| "learning_rate": 2.049764257572822e-05, |
| "loss": 0.0293, |
| "num_input_tokens_seen": 654844160, |
| "step": 639500 |
| }, |
| { |
| "epoch": 5.905094066303135, |
| "grad_norm": 0.7684640884399414, |
| "learning_rate": 2.0474575802031724e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 655356160, |
| "step": 640000 |
| }, |
| { |
| "epoch": 5.909707421042434, |
| "grad_norm": 1.2843828201293945, |
| "learning_rate": 2.0451509028335224e-05, |
| "loss": 0.0252, |
| "num_input_tokens_seen": 655868160, |
| "step": 640500 |
| }, |
| { |
| "epoch": 5.914320775781733, |
| "grad_norm": 1.0203999280929565, |
| "learning_rate": 2.042844225463873e-05, |
| "loss": 0.0268, |
| "num_input_tokens_seen": 656380160, |
| "step": 641000 |
| }, |
| { |
| "epoch": 5.918934130521032, |
| "grad_norm": 2.00242280960083, |
| "learning_rate": 2.0405375480942232e-05, |
| "loss": 0.0285, |
| "num_input_tokens_seen": 656892160, |
| "step": 641500 |
| }, |
| { |
| "epoch": 5.923547485260332, |
| "grad_norm": 1.0357120037078857, |
| "learning_rate": 2.0382308707245736e-05, |
| "loss": 0.0239, |
| "num_input_tokens_seen": 657404160, |
| "step": 642000 |
| }, |
| { |
| "epoch": 5.928160839999631, |
| "grad_norm": 1.1826400756835938, |
| "learning_rate": 2.035924193354924e-05, |
| "loss": 0.0268, |
| "num_input_tokens_seen": 657916160, |
| "step": 642500 |
| }, |
| { |
| "epoch": 5.93277419473893, |
| "grad_norm": 1.5662238597869873, |
| "learning_rate": 2.0336175159852744e-05, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 658428160, |
| "step": 643000 |
| }, |
| { |
| "epoch": 5.937387549478229, |
| "grad_norm": 3.335893392562866, |
| "learning_rate": 2.0313108386156245e-05, |
| "loss": 0.0273, |
| "num_input_tokens_seen": 658940160, |
| "step": 643500 |
| }, |
| { |
| "epoch": 5.942000904217529, |
| "grad_norm": 0.7126489281654358, |
| "learning_rate": 2.029004161245975e-05, |
| "loss": 0.0268, |
| "num_input_tokens_seen": 659452160, |
| "step": 644000 |
| }, |
| { |
| "epoch": 5.946614258956828, |
| "grad_norm": 1.0062040090560913, |
| "learning_rate": 2.0266974838763253e-05, |
| "loss": 0.0276, |
| "num_input_tokens_seen": 659964160, |
| "step": 644500 |
| }, |
| { |
| "epoch": 5.951227613696128, |
| "grad_norm": 1.2691099643707275, |
| "learning_rate": 2.0243908065066757e-05, |
| "loss": 0.0295, |
| "num_input_tokens_seen": 660476160, |
| "step": 645000 |
| }, |
| { |
| "epoch": 5.955840968435427, |
| "grad_norm": 0.9768707752227783, |
| "learning_rate": 2.022084129137026e-05, |
| "loss": 0.0287, |
| "num_input_tokens_seen": 660988160, |
| "step": 645500 |
| }, |
| { |
| "epoch": 5.9604543231747265, |
| "grad_norm": 1.5846303701400757, |
| "learning_rate": 2.019777451767376e-05, |
| "loss": 0.028, |
| "num_input_tokens_seen": 661500160, |
| "step": 646000 |
| }, |
| { |
| "epoch": 5.965067677914026, |
| "grad_norm": 0.556376576423645, |
| "learning_rate": 2.017470774397727e-05, |
| "loss": 0.029, |
| "num_input_tokens_seen": 662012160, |
| "step": 646500 |
| }, |
| { |
| "epoch": 5.969681032653325, |
| "grad_norm": 1.8407984972000122, |
| "learning_rate": 2.015164097028077e-05, |
| "loss": 0.0278, |
| "num_input_tokens_seen": 662524160, |
| "step": 647000 |
| }, |
| { |
| "epoch": 5.974294387392624, |
| "grad_norm": 2.419261932373047, |
| "learning_rate": 2.0128574196584273e-05, |
| "loss": 0.0264, |
| "num_input_tokens_seen": 663036160, |
| "step": 647500 |
| }, |
| { |
| "epoch": 5.978907742131923, |
| "grad_norm": 1.3140838146209717, |
| "learning_rate": 2.0105507422887777e-05, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 663548160, |
| "step": 648000 |
| }, |
| { |
| "epoch": 5.983521096871223, |
| "grad_norm": 1.3511277437210083, |
| "learning_rate": 2.008244064919128e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 664060160, |
| "step": 648500 |
| }, |
| { |
| "epoch": 5.988134451610522, |
| "grad_norm": 0.9623832106590271, |
| "learning_rate": 2.005937387549478e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 664572160, |
| "step": 649000 |
| }, |
| { |
| "epoch": 5.992747806349821, |
| "grad_norm": 1.2604849338531494, |
| "learning_rate": 2.003630710179829e-05, |
| "loss": 0.0257, |
| "num_input_tokens_seen": 665084160, |
| "step": 649500 |
| }, |
| { |
| "epoch": 5.99736116108912, |
| "grad_norm": 0.5637773871421814, |
| "learning_rate": 2.001324032810179e-05, |
| "loss": 0.0276, |
| "num_input_tokens_seen": 665596160, |
| "step": 650000 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_combined_score": 0.06719425867896905, |
| "eval_loss": 0.06719426065683365, |
| "eval_mse": 0.06719425670110447, |
| "eval_runtime": 46.0502, |
| "eval_samples_per_second": 2092.023, |
| "eval_steps_per_second": 261.519, |
| "num_input_tokens_seen": 665888256, |
| "step": 650286 |
| }, |
| { |
| "epoch": 6.00197451582842, |
| "grad_norm": 0.7754026055335999, |
| "learning_rate": 1.9990173554405293e-05, |
| "loss": 0.0243, |
| "num_input_tokens_seen": 666107392, |
| "step": 650500 |
| }, |
| { |
| "epoch": 6.00658787056772, |
| "grad_norm": 3.4056851863861084, |
| "learning_rate": 1.9967106780708797e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 666619392, |
| "step": 651000 |
| }, |
| { |
| "epoch": 6.011201225307019, |
| "grad_norm": 0.7338670492172241, |
| "learning_rate": 1.9944040007012298e-05, |
| "loss": 0.0194, |
| "num_input_tokens_seen": 667131392, |
| "step": 651500 |
| }, |
| { |
| "epoch": 6.015814580046318, |
| "grad_norm": 0.9775220155715942, |
| "learning_rate": 1.9920973233315805e-05, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 667643392, |
| "step": 652000 |
| }, |
| { |
| "epoch": 6.0204279347856176, |
| "grad_norm": 0.6513090133666992, |
| "learning_rate": 1.9897906459619306e-05, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 668155392, |
| "step": 652500 |
| }, |
| { |
| "epoch": 6.025041289524917, |
| "grad_norm": 1.0997514724731445, |
| "learning_rate": 1.987483968592281e-05, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 668667392, |
| "step": 653000 |
| }, |
| { |
| "epoch": 6.029654644264216, |
| "grad_norm": 1.8776363134384155, |
| "learning_rate": 1.9851772912226314e-05, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 669179392, |
| "step": 653500 |
| }, |
| { |
| "epoch": 6.034267999003515, |
| "grad_norm": 1.0117559432983398, |
| "learning_rate": 1.9828706138529818e-05, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 669691392, |
| "step": 654000 |
| }, |
| { |
| "epoch": 6.0388813537428145, |
| "grad_norm": 1.839374303817749, |
| "learning_rate": 1.980563936483332e-05, |
| "loss": 0.0206, |
| "num_input_tokens_seen": 670203392, |
| "step": 654500 |
| }, |
| { |
| "epoch": 6.043494708482114, |
| "grad_norm": 1.1383150815963745, |
| "learning_rate": 1.9782572591136826e-05, |
| "loss": 0.02, |
| "num_input_tokens_seen": 670715392, |
| "step": 655000 |
| }, |
| { |
| "epoch": 6.048108063221413, |
| "grad_norm": 0.6940335631370544, |
| "learning_rate": 1.9759505817440326e-05, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 671227392, |
| "step": 655500 |
| }, |
| { |
| "epoch": 6.052721417960712, |
| "grad_norm": 0.9437240958213806, |
| "learning_rate": 1.973643904374383e-05, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 671739392, |
| "step": 656000 |
| }, |
| { |
| "epoch": 6.057334772700012, |
| "grad_norm": 1.297887921333313, |
| "learning_rate": 1.9713372270047334e-05, |
| "loss": 0.0197, |
| "num_input_tokens_seen": 672251392, |
| "step": 656500 |
| }, |
| { |
| "epoch": 6.061948127439312, |
| "grad_norm": 1.1121424436569214, |
| "learning_rate": 1.9690305496350835e-05, |
| "loss": 0.0228, |
| "num_input_tokens_seen": 672763392, |
| "step": 657000 |
| }, |
| { |
| "epoch": 6.066561482178611, |
| "grad_norm": 1.2576148509979248, |
| "learning_rate": 1.9667238722654342e-05, |
| "loss": 0.0202, |
| "num_input_tokens_seen": 673275392, |
| "step": 657500 |
| }, |
| { |
| "epoch": 6.07117483691791, |
| "grad_norm": 0.9484318494796753, |
| "learning_rate": 1.9644171948957843e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 673787392, |
| "step": 658000 |
| }, |
| { |
| "epoch": 6.075788191657209, |
| "grad_norm": 1.5170820951461792, |
| "learning_rate": 1.9621105175261347e-05, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 674299392, |
| "step": 658500 |
| }, |
| { |
| "epoch": 6.080401546396509, |
| "grad_norm": 1.5162551403045654, |
| "learning_rate": 1.959803840156485e-05, |
| "loss": 0.0209, |
| "num_input_tokens_seen": 674811392, |
| "step": 659000 |
| }, |
| { |
| "epoch": 6.085014901135808, |
| "grad_norm": 1.1097129583358765, |
| "learning_rate": 1.9574971627868355e-05, |
| "loss": 0.0211, |
| "num_input_tokens_seen": 675323392, |
| "step": 659500 |
| }, |
| { |
| "epoch": 6.089628255875107, |
| "grad_norm": 1.9856687784194946, |
| "learning_rate": 1.9551904854171855e-05, |
| "loss": 0.0205, |
| "num_input_tokens_seen": 675835392, |
| "step": 660000 |
| }, |
| { |
| "epoch": 6.094241610614406, |
| "grad_norm": 0.447665810585022, |
| "learning_rate": 1.9528838080475363e-05, |
| "loss": 0.0215, |
| "num_input_tokens_seen": 676347392, |
| "step": 660500 |
| }, |
| { |
| "epoch": 6.098854965353706, |
| "grad_norm": 0.6140983700752258, |
| "learning_rate": 1.9505771306778863e-05, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 676859392, |
| "step": 661000 |
| }, |
| { |
| "epoch": 6.103468320093005, |
| "grad_norm": 0.6753659844398499, |
| "learning_rate": 1.9482704533082367e-05, |
| "loss": 0.0234, |
| "num_input_tokens_seen": 677371392, |
| "step": 661500 |
| }, |
| { |
| "epoch": 6.108081674832304, |
| "grad_norm": 0.5752419233322144, |
| "learning_rate": 1.945963775938587e-05, |
| "loss": 0.0202, |
| "num_input_tokens_seen": 677883392, |
| "step": 662000 |
| }, |
| { |
| "epoch": 6.112695029571604, |
| "grad_norm": 0.8498187065124512, |
| "learning_rate": 1.9436570985689375e-05, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 678395392, |
| "step": 662500 |
| }, |
| { |
| "epoch": 6.1173083843109035, |
| "grad_norm": 0.8756592273712158, |
| "learning_rate": 1.941350421199288e-05, |
| "loss": 0.0193, |
| "num_input_tokens_seen": 678907392, |
| "step": 663000 |
| }, |
| { |
| "epoch": 6.121921739050203, |
| "grad_norm": 2.693408250808716, |
| "learning_rate": 1.939043743829638e-05, |
| "loss": 0.0192, |
| "num_input_tokens_seen": 679419392, |
| "step": 663500 |
| }, |
| { |
| "epoch": 6.126535093789502, |
| "grad_norm": 1.2562410831451416, |
| "learning_rate": 1.9367370664599884e-05, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 679931392, |
| "step": 664000 |
| }, |
| { |
| "epoch": 6.131148448528801, |
| "grad_norm": 1.662607192993164, |
| "learning_rate": 1.9344303890903388e-05, |
| "loss": 0.0202, |
| "num_input_tokens_seen": 680443392, |
| "step": 664500 |
| }, |
| { |
| "epoch": 6.1357618032681005, |
| "grad_norm": 0.8095691800117493, |
| "learning_rate": 1.932123711720689e-05, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 680955392, |
| "step": 665000 |
| }, |
| { |
| "epoch": 6.1403751580074, |
| "grad_norm": 0.5978444218635559, |
| "learning_rate": 1.9298170343510392e-05, |
| "loss": 0.0211, |
| "num_input_tokens_seen": 681467392, |
| "step": 665500 |
| }, |
| { |
| "epoch": 6.144988512746699, |
| "grad_norm": 0.5060915946960449, |
| "learning_rate": 1.92751035698139e-05, |
| "loss": 0.0227, |
| "num_input_tokens_seen": 681979392, |
| "step": 666000 |
| }, |
| { |
| "epoch": 6.149601867485998, |
| "grad_norm": 0.9484182596206665, |
| "learning_rate": 1.92520367961174e-05, |
| "loss": 0.0203, |
| "num_input_tokens_seen": 682491392, |
| "step": 666500 |
| }, |
| { |
| "epoch": 6.1542152222252975, |
| "grad_norm": 1.3608324527740479, |
| "learning_rate": 1.9228970022420904e-05, |
| "loss": 0.0222, |
| "num_input_tokens_seen": 683003392, |
| "step": 667000 |
| }, |
| { |
| "epoch": 6.158828576964597, |
| "grad_norm": 0.9933167099952698, |
| "learning_rate": 1.9205903248724408e-05, |
| "loss": 0.0193, |
| "num_input_tokens_seen": 683515392, |
| "step": 667500 |
| }, |
| { |
| "epoch": 6.163441931703897, |
| "grad_norm": 1.8458038568496704, |
| "learning_rate": 1.9182836475027912e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 684027392, |
| "step": 668000 |
| }, |
| { |
| "epoch": 6.168055286443196, |
| "grad_norm": 0.9922088384628296, |
| "learning_rate": 1.9159769701331416e-05, |
| "loss": 0.0227, |
| "num_input_tokens_seen": 684539392, |
| "step": 668500 |
| }, |
| { |
| "epoch": 6.172668641182495, |
| "grad_norm": 0.7523616552352905, |
| "learning_rate": 1.913670292763492e-05, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 685051392, |
| "step": 669000 |
| }, |
| { |
| "epoch": 6.177281995921795, |
| "grad_norm": 1.4571471214294434, |
| "learning_rate": 1.911363615393842e-05, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 685563392, |
| "step": 669500 |
| }, |
| { |
| "epoch": 6.181895350661094, |
| "grad_norm": 1.6645666360855103, |
| "learning_rate": 1.9090569380241925e-05, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 686075392, |
| "step": 670000 |
| }, |
| { |
| "epoch": 6.186508705400393, |
| "grad_norm": 0.5746430158615112, |
| "learning_rate": 1.906750260654543e-05, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 686587392, |
| "step": 670500 |
| }, |
| { |
| "epoch": 6.191122060139692, |
| "grad_norm": 0.6545117497444153, |
| "learning_rate": 1.9044435832848933e-05, |
| "loss": 0.0203, |
| "num_input_tokens_seen": 687099392, |
| "step": 671000 |
| }, |
| { |
| "epoch": 6.195735414878992, |
| "grad_norm": 0.6282312273979187, |
| "learning_rate": 1.9021369059152436e-05, |
| "loss": 0.0222, |
| "num_input_tokens_seen": 687611392, |
| "step": 671500 |
| }, |
| { |
| "epoch": 6.200348769618291, |
| "grad_norm": 0.7718172073364258, |
| "learning_rate": 1.8998302285455937e-05, |
| "loss": 0.0211, |
| "num_input_tokens_seen": 688123392, |
| "step": 672000 |
| }, |
| { |
| "epoch": 6.20496212435759, |
| "grad_norm": 1.4277899265289307, |
| "learning_rate": 1.897523551175944e-05, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 688635392, |
| "step": 672500 |
| }, |
| { |
| "epoch": 6.209575479096889, |
| "grad_norm": 0.5869673490524292, |
| "learning_rate": 1.8952168738062945e-05, |
| "loss": 0.0201, |
| "num_input_tokens_seen": 689147392, |
| "step": 673000 |
| }, |
| { |
| "epoch": 6.214188833836189, |
| "grad_norm": 0.7148327231407166, |
| "learning_rate": 1.892910196436645e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 689659392, |
| "step": 673500 |
| }, |
| { |
| "epoch": 6.218802188575489, |
| "grad_norm": 1.9917762279510498, |
| "learning_rate": 1.8906035190669953e-05, |
| "loss": 0.0201, |
| "num_input_tokens_seen": 690171392, |
| "step": 674000 |
| }, |
| { |
| "epoch": 6.223415543314788, |
| "grad_norm": 1.030920386314392, |
| "learning_rate": 1.8882968416973457e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 690683392, |
| "step": 674500 |
| }, |
| { |
| "epoch": 6.228028898054087, |
| "grad_norm": 0.6258344054222107, |
| "learning_rate": 1.8859901643276958e-05, |
| "loss": 0.0205, |
| "num_input_tokens_seen": 691195392, |
| "step": 675000 |
| }, |
| { |
| "epoch": 6.2326422527933865, |
| "grad_norm": 2.0319483280181885, |
| "learning_rate": 1.8836834869580465e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 691707392, |
| "step": 675500 |
| }, |
| { |
| "epoch": 6.237255607532686, |
| "grad_norm": 0.5357654094696045, |
| "learning_rate": 1.8813768095883965e-05, |
| "loss": 0.021, |
| "num_input_tokens_seen": 692219392, |
| "step": 676000 |
| }, |
| { |
| "epoch": 6.241868962271985, |
| "grad_norm": 2.2843759059906006, |
| "learning_rate": 1.879070132218747e-05, |
| "loss": 0.0224, |
| "num_input_tokens_seen": 692731392, |
| "step": 676500 |
| }, |
| { |
| "epoch": 6.246482317011284, |
| "grad_norm": 0.7464880347251892, |
| "learning_rate": 1.8767634548490973e-05, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 693243392, |
| "step": 677000 |
| }, |
| { |
| "epoch": 6.2510956717505834, |
| "grad_norm": 1.1594797372817993, |
| "learning_rate": 1.8744567774794474e-05, |
| "loss": 0.0219, |
| "num_input_tokens_seen": 693755392, |
| "step": 677500 |
| }, |
| { |
| "epoch": 6.255709026489883, |
| "grad_norm": 2.049744129180908, |
| "learning_rate": 1.872150100109798e-05, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 694267392, |
| "step": 678000 |
| }, |
| { |
| "epoch": 6.260322381229182, |
| "grad_norm": 2.227196216583252, |
| "learning_rate": 1.8698434227401482e-05, |
| "loss": 0.0215, |
| "num_input_tokens_seen": 694779392, |
| "step": 678500 |
| }, |
| { |
| "epoch": 6.264935735968481, |
| "grad_norm": 1.209151268005371, |
| "learning_rate": 1.8675367453704986e-05, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 695291392, |
| "step": 679000 |
| }, |
| { |
| "epoch": 6.26954909070778, |
| "grad_norm": 0.6479954123497009, |
| "learning_rate": 1.865230068000849e-05, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 695803392, |
| "step": 679500 |
| }, |
| { |
| "epoch": 6.274162445447081, |
| "grad_norm": 0.5225302577018738, |
| "learning_rate": 1.8629233906311994e-05, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 696315392, |
| "step": 680000 |
| }, |
| { |
| "epoch": 6.27877580018638, |
| "grad_norm": 0.8142069578170776, |
| "learning_rate": 1.8606167132615494e-05, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 696827392, |
| "step": 680500 |
| }, |
| { |
| "epoch": 6.283389154925679, |
| "grad_norm": 2.5518014430999756, |
| "learning_rate": 1.8583100358919002e-05, |
| "loss": 0.0221, |
| "num_input_tokens_seen": 697339392, |
| "step": 681000 |
| }, |
| { |
| "epoch": 6.288002509664978, |
| "grad_norm": 0.609211266040802, |
| "learning_rate": 1.8560033585222502e-05, |
| "loss": 0.0238, |
| "num_input_tokens_seen": 697851392, |
| "step": 681500 |
| }, |
| { |
| "epoch": 6.292615864404278, |
| "grad_norm": 0.6666821837425232, |
| "learning_rate": 1.8536966811526006e-05, |
| "loss": 0.0235, |
| "num_input_tokens_seen": 698363392, |
| "step": 682000 |
| }, |
| { |
| "epoch": 6.297229219143577, |
| "grad_norm": 2.551591396331787, |
| "learning_rate": 1.851390003782951e-05, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 698875392, |
| "step": 682500 |
| }, |
| { |
| "epoch": 6.301842573882876, |
| "grad_norm": 1.171808123588562, |
| "learning_rate": 1.849083326413301e-05, |
| "loss": 0.0211, |
| "num_input_tokens_seen": 699387392, |
| "step": 683000 |
| }, |
| { |
| "epoch": 6.306455928622175, |
| "grad_norm": 1.9758840799331665, |
| "learning_rate": 1.8467766490436518e-05, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 699899392, |
| "step": 683500 |
| }, |
| { |
| "epoch": 6.3110692833614745, |
| "grad_norm": 0.7469502091407776, |
| "learning_rate": 1.844469971674002e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 700411392, |
| "step": 684000 |
| }, |
| { |
| "epoch": 6.315682638100774, |
| "grad_norm": 0.9809781908988953, |
| "learning_rate": 1.8421632943043523e-05, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 700923392, |
| "step": 684500 |
| }, |
| { |
| "epoch": 6.320295992840073, |
| "grad_norm": 0.9586873650550842, |
| "learning_rate": 1.8398566169347027e-05, |
| "loss": 0.021, |
| "num_input_tokens_seen": 701435392, |
| "step": 685000 |
| }, |
| { |
| "epoch": 6.324909347579373, |
| "grad_norm": 8.868587493896484, |
| "learning_rate": 1.837549939565053e-05, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 701947392, |
| "step": 685500 |
| }, |
| { |
| "epoch": 6.329522702318672, |
| "grad_norm": 1.1265676021575928, |
| "learning_rate": 1.835243262195403e-05, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 702459392, |
| "step": 686000 |
| }, |
| { |
| "epoch": 6.334136057057972, |
| "grad_norm": 1.0341181755065918, |
| "learning_rate": 1.832936584825754e-05, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 702971392, |
| "step": 686500 |
| }, |
| { |
| "epoch": 6.338749411797271, |
| "grad_norm": 0.3800777196884155, |
| "learning_rate": 1.830629907456104e-05, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 703483392, |
| "step": 687000 |
| }, |
| { |
| "epoch": 6.34336276653657, |
| "grad_norm": 0.7369467616081238, |
| "learning_rate": 1.8283232300864543e-05, |
| "loss": 0.0234, |
| "num_input_tokens_seen": 703995392, |
| "step": 687500 |
| }, |
| { |
| "epoch": 6.347976121275869, |
| "grad_norm": 1.0980653762817383, |
| "learning_rate": 1.8260165527168047e-05, |
| "loss": 0.02, |
| "num_input_tokens_seen": 704507392, |
| "step": 688000 |
| }, |
| { |
| "epoch": 6.352589476015169, |
| "grad_norm": 17.581872940063477, |
| "learning_rate": 1.823709875347155e-05, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 705019392, |
| "step": 688500 |
| }, |
| { |
| "epoch": 6.357202830754468, |
| "grad_norm": 0.5301328301429749, |
| "learning_rate": 1.8214031979775055e-05, |
| "loss": 0.0226, |
| "num_input_tokens_seen": 705531392, |
| "step": 689000 |
| }, |
| { |
| "epoch": 6.361816185493767, |
| "grad_norm": 0.44786104559898376, |
| "learning_rate": 1.8190965206078556e-05, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 706043392, |
| "step": 689500 |
| }, |
| { |
| "epoch": 6.366429540233066, |
| "grad_norm": 2.587684154510498, |
| "learning_rate": 1.816789843238206e-05, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 706555392, |
| "step": 690000 |
| }, |
| { |
| "epoch": 6.371042894972366, |
| "grad_norm": 1.0485097169876099, |
| "learning_rate": 1.8144831658685564e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 707067392, |
| "step": 690500 |
| }, |
| { |
| "epoch": 6.375656249711666, |
| "grad_norm": 0.38697299361228943, |
| "learning_rate": 1.8121764884989068e-05, |
| "loss": 0.0224, |
| "num_input_tokens_seen": 707579392, |
| "step": 691000 |
| }, |
| { |
| "epoch": 6.380269604450965, |
| "grad_norm": 1.7703328132629395, |
| "learning_rate": 1.8098698111292568e-05, |
| "loss": 0.0205, |
| "num_input_tokens_seen": 708091392, |
| "step": 691500 |
| }, |
| { |
| "epoch": 6.384882959190264, |
| "grad_norm": 0.5361246466636658, |
| "learning_rate": 1.8075631337596076e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 708603392, |
| "step": 692000 |
| }, |
| { |
| "epoch": 6.3894963139295635, |
| "grad_norm": 0.7262565493583679, |
| "learning_rate": 1.8052564563899576e-05, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 709115392, |
| "step": 692500 |
| }, |
| { |
| "epoch": 6.394109668668863, |
| "grad_norm": 0.5426166653633118, |
| "learning_rate": 1.802949779020308e-05, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 709627392, |
| "step": 693000 |
| }, |
| { |
| "epoch": 6.398723023408162, |
| "grad_norm": 0.9370472431182861, |
| "learning_rate": 1.8006431016506584e-05, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 710139392, |
| "step": 693500 |
| }, |
| { |
| "epoch": 6.403336378147461, |
| "grad_norm": 1.1743369102478027, |
| "learning_rate": 1.7983364242810088e-05, |
| "loss": 0.0221, |
| "num_input_tokens_seen": 710651392, |
| "step": 694000 |
| }, |
| { |
| "epoch": 6.4079497328867605, |
| "grad_norm": 1.1654258966445923, |
| "learning_rate": 1.7960297469113592e-05, |
| "loss": 0.0239, |
| "num_input_tokens_seen": 711163392, |
| "step": 694500 |
| }, |
| { |
| "epoch": 6.41256308762606, |
| "grad_norm": 0.9082449078559875, |
| "learning_rate": 1.7937230695417096e-05, |
| "loss": 0.0206, |
| "num_input_tokens_seen": 711675392, |
| "step": 695000 |
| }, |
| { |
| "epoch": 6.417176442365359, |
| "grad_norm": 0.7706845998764038, |
| "learning_rate": 1.7914163921720597e-05, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 712187392, |
| "step": 695500 |
| }, |
| { |
| "epoch": 6.421789797104658, |
| "grad_norm": 0.8697851896286011, |
| "learning_rate": 1.78910971480241e-05, |
| "loss": 0.0196, |
| "num_input_tokens_seen": 712699392, |
| "step": 696000 |
| }, |
| { |
| "epoch": 6.4264031518439575, |
| "grad_norm": 0.8328973054885864, |
| "learning_rate": 1.7868030374327605e-05, |
| "loss": 0.0191, |
| "num_input_tokens_seen": 713211392, |
| "step": 696500 |
| }, |
| { |
| "epoch": 6.431016506583257, |
| "grad_norm": 7.328830242156982, |
| "learning_rate": 1.7844963600631105e-05, |
| "loss": 0.0219, |
| "num_input_tokens_seen": 713723392, |
| "step": 697000 |
| }, |
| { |
| "epoch": 6.435629861322557, |
| "grad_norm": 0.9811331629753113, |
| "learning_rate": 1.7821896826934612e-05, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 714235392, |
| "step": 697500 |
| }, |
| { |
| "epoch": 6.440243216061856, |
| "grad_norm": 2.4249658584594727, |
| "learning_rate": 1.7798830053238113e-05, |
| "loss": 0.0199, |
| "num_input_tokens_seen": 714747392, |
| "step": 698000 |
| }, |
| { |
| "epoch": 6.444856570801155, |
| "grad_norm": 1.6844923496246338, |
| "learning_rate": 1.7775763279541617e-05, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 715259392, |
| "step": 698500 |
| }, |
| { |
| "epoch": 6.449469925540455, |
| "grad_norm": 2.2441189289093018, |
| "learning_rate": 1.775269650584512e-05, |
| "loss": 0.0205, |
| "num_input_tokens_seen": 715771392, |
| "step": 699000 |
| }, |
| { |
| "epoch": 6.454083280279754, |
| "grad_norm": 0.4577130973339081, |
| "learning_rate": 1.7729629732148625e-05, |
| "loss": 0.022, |
| "num_input_tokens_seen": 716283392, |
| "step": 699500 |
| }, |
| { |
| "epoch": 6.458696635019053, |
| "grad_norm": 1.2576284408569336, |
| "learning_rate": 1.770656295845213e-05, |
| "loss": 0.021, |
| "num_input_tokens_seen": 716795392, |
| "step": 700000 |
| }, |
| { |
| "epoch": 6.463309989758352, |
| "grad_norm": 1.3181337118148804, |
| "learning_rate": 1.7683496184755633e-05, |
| "loss": 0.0228, |
| "num_input_tokens_seen": 717307392, |
| "step": 700500 |
| }, |
| { |
| "epoch": 6.467923344497652, |
| "grad_norm": 0.6435089707374573, |
| "learning_rate": 1.7660429411059133e-05, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 717819392, |
| "step": 701000 |
| }, |
| { |
| "epoch": 6.472536699236951, |
| "grad_norm": 1.2723332643508911, |
| "learning_rate": 1.763736263736264e-05, |
| "loss": 0.0197, |
| "num_input_tokens_seen": 718331392, |
| "step": 701500 |
| }, |
| { |
| "epoch": 6.47715005397625, |
| "grad_norm": 5.60179328918457, |
| "learning_rate": 1.761429586366614e-05, |
| "loss": 0.0246, |
| "num_input_tokens_seen": 718843392, |
| "step": 702000 |
| }, |
| { |
| "epoch": 6.481763408715549, |
| "grad_norm": 1.1845461130142212, |
| "learning_rate": 1.7591229089969642e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 719355392, |
| "step": 702500 |
| }, |
| { |
| "epoch": 6.4863767634548495, |
| "grad_norm": 0.9325453042984009, |
| "learning_rate": 1.756816231627315e-05, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 719867392, |
| "step": 703000 |
| }, |
| { |
| "epoch": 6.490990118194149, |
| "grad_norm": 1.919224500656128, |
| "learning_rate": 1.754509554257665e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 720379392, |
| "step": 703500 |
| }, |
| { |
| "epoch": 6.495603472933448, |
| "grad_norm": 0.8646382093429565, |
| "learning_rate": 1.7522028768880154e-05, |
| "loss": 0.0227, |
| "num_input_tokens_seen": 720891392, |
| "step": 704000 |
| }, |
| { |
| "epoch": 6.500216827672747, |
| "grad_norm": 0.6728546619415283, |
| "learning_rate": 1.7498961995183658e-05, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 721403392, |
| "step": 704500 |
| }, |
| { |
| "epoch": 6.5048301824120465, |
| "grad_norm": 1.701745629310608, |
| "learning_rate": 1.7475895221487162e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 721915392, |
| "step": 705000 |
| }, |
| { |
| "epoch": 6.509443537151346, |
| "grad_norm": 1.382514476776123, |
| "learning_rate": 1.7452828447790666e-05, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 722427392, |
| "step": 705500 |
| }, |
| { |
| "epoch": 6.514056891890645, |
| "grad_norm": 1.366165041923523, |
| "learning_rate": 1.742976167409417e-05, |
| "loss": 0.0244, |
| "num_input_tokens_seen": 722939392, |
| "step": 706000 |
| }, |
| { |
| "epoch": 6.518670246629944, |
| "grad_norm": 0.727484405040741, |
| "learning_rate": 1.740669490039767e-05, |
| "loss": 0.0215, |
| "num_input_tokens_seen": 723451392, |
| "step": 706500 |
| }, |
| { |
| "epoch": 6.5232836013692435, |
| "grad_norm": 0.9992395043373108, |
| "learning_rate": 1.7383628126701178e-05, |
| "loss": 0.0203, |
| "num_input_tokens_seen": 723963392, |
| "step": 707000 |
| }, |
| { |
| "epoch": 6.527896956108543, |
| "grad_norm": 1.4681673049926758, |
| "learning_rate": 1.736056135300468e-05, |
| "loss": 0.0236, |
| "num_input_tokens_seen": 724475392, |
| "step": 707500 |
| }, |
| { |
| "epoch": 6.532510310847842, |
| "grad_norm": 0.6639313101768494, |
| "learning_rate": 1.7337494579308182e-05, |
| "loss": 0.0196, |
| "num_input_tokens_seen": 724987392, |
| "step": 708000 |
| }, |
| { |
| "epoch": 6.537123665587142, |
| "grad_norm": 1.4685230255126953, |
| "learning_rate": 1.7314427805611686e-05, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 725499392, |
| "step": 708500 |
| }, |
| { |
| "epoch": 6.541737020326441, |
| "grad_norm": 0.711995542049408, |
| "learning_rate": 1.729136103191519e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 726011392, |
| "step": 709000 |
| }, |
| { |
| "epoch": 6.546350375065741, |
| "grad_norm": 0.849071204662323, |
| "learning_rate": 1.726829425821869e-05, |
| "loss": 0.0219, |
| "num_input_tokens_seen": 726523392, |
| "step": 709500 |
| }, |
| { |
| "epoch": 6.55096372980504, |
| "grad_norm": 0.7562097311019897, |
| "learning_rate": 1.7245227484522195e-05, |
| "loss": 0.0203, |
| "num_input_tokens_seen": 727035392, |
| "step": 710000 |
| }, |
| { |
| "epoch": 6.555577084544339, |
| "grad_norm": 1.556663155555725, |
| "learning_rate": 1.72221607108257e-05, |
| "loss": 0.0224, |
| "num_input_tokens_seen": 727547392, |
| "step": 710500 |
| }, |
| { |
| "epoch": 6.560190439283638, |
| "grad_norm": 3.2554850578308105, |
| "learning_rate": 1.7199093937129203e-05, |
| "loss": 0.022, |
| "num_input_tokens_seen": 728059392, |
| "step": 711000 |
| }, |
| { |
| "epoch": 6.564803794022938, |
| "grad_norm": 1.4903610944747925, |
| "learning_rate": 1.7176027163432707e-05, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 728571392, |
| "step": 711500 |
| }, |
| { |
| "epoch": 6.569417148762237, |
| "grad_norm": 1.828810691833496, |
| "learning_rate": 1.7152960389736207e-05, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 729083392, |
| "step": 712000 |
| }, |
| { |
| "epoch": 6.574030503501536, |
| "grad_norm": 0.5452165603637695, |
| "learning_rate": 1.7129893616039715e-05, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 729595392, |
| "step": 712500 |
| }, |
| { |
| "epoch": 6.578643858240835, |
| "grad_norm": 1.4269682168960571, |
| "learning_rate": 1.7106826842343215e-05, |
| "loss": 0.0222, |
| "num_input_tokens_seen": 730107392, |
| "step": 713000 |
| }, |
| { |
| "epoch": 6.5832572129801346, |
| "grad_norm": 0.5227313041687012, |
| "learning_rate": 1.708376006864672e-05, |
| "loss": 0.0201, |
| "num_input_tokens_seen": 730619392, |
| "step": 713500 |
| }, |
| { |
| "epoch": 6.587870567719435, |
| "grad_norm": 0.8635200262069702, |
| "learning_rate": 1.7060693294950223e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 731131392, |
| "step": 714000 |
| }, |
| { |
| "epoch": 6.592483922458733, |
| "grad_norm": 1.070576548576355, |
| "learning_rate": 1.7037626521253727e-05, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 731643392, |
| "step": 714500 |
| }, |
| { |
| "epoch": 6.597097277198033, |
| "grad_norm": 21.42013931274414, |
| "learning_rate": 1.7014559747557228e-05, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 732155392, |
| "step": 715000 |
| }, |
| { |
| "epoch": 6.601710631937332, |
| "grad_norm": 1.3582208156585693, |
| "learning_rate": 1.6991492973860735e-05, |
| "loss": 0.0222, |
| "num_input_tokens_seen": 732667392, |
| "step": 715500 |
| }, |
| { |
| "epoch": 6.606323986676632, |
| "grad_norm": 1.3939865827560425, |
| "learning_rate": 1.6968426200164236e-05, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 733179392, |
| "step": 716000 |
| }, |
| { |
| "epoch": 6.610937341415931, |
| "grad_norm": 1.0751606225967407, |
| "learning_rate": 1.694535942646774e-05, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 733691392, |
| "step": 716500 |
| }, |
| { |
| "epoch": 6.61555069615523, |
| "grad_norm": 1.630864143371582, |
| "learning_rate": 1.6922292652771244e-05, |
| "loss": 0.0215, |
| "num_input_tokens_seen": 734203392, |
| "step": 717000 |
| }, |
| { |
| "epoch": 6.620164050894529, |
| "grad_norm": 0.7903428077697754, |
| "learning_rate": 1.6899225879074744e-05, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 734715392, |
| "step": 717500 |
| }, |
| { |
| "epoch": 6.624777405633829, |
| "grad_norm": 0.9173442125320435, |
| "learning_rate": 1.687615910537825e-05, |
| "loss": 0.0205, |
| "num_input_tokens_seen": 735227392, |
| "step": 718000 |
| }, |
| { |
| "epoch": 6.629390760373128, |
| "grad_norm": 0.4864923059940338, |
| "learning_rate": 1.6853092331681752e-05, |
| "loss": 0.0222, |
| "num_input_tokens_seen": 735739392, |
| "step": 718500 |
| }, |
| { |
| "epoch": 6.634004115112427, |
| "grad_norm": 2.9184951782226562, |
| "learning_rate": 1.6830025557985256e-05, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 736251392, |
| "step": 719000 |
| }, |
| { |
| "epoch": 6.638617469851726, |
| "grad_norm": 0.9503863453865051, |
| "learning_rate": 1.680695878428876e-05, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 736763392, |
| "step": 719500 |
| }, |
| { |
| "epoch": 6.643230824591026, |
| "grad_norm": 1.129035234451294, |
| "learning_rate": 1.6783892010592264e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 737275392, |
| "step": 720000 |
| }, |
| { |
| "epoch": 6.647844179330326, |
| "grad_norm": 0.7650052309036255, |
| "learning_rate": 1.6760825236895768e-05, |
| "loss": 0.0209, |
| "num_input_tokens_seen": 737787392, |
| "step": 720500 |
| }, |
| { |
| "epoch": 6.652457534069625, |
| "grad_norm": 1.070244312286377, |
| "learning_rate": 1.6737758463199272e-05, |
| "loss": 0.0201, |
| "num_input_tokens_seen": 738299392, |
| "step": 721000 |
| }, |
| { |
| "epoch": 6.657070888808924, |
| "grad_norm": 1.1811015605926514, |
| "learning_rate": 1.6714691689502773e-05, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 738811392, |
| "step": 721500 |
| }, |
| { |
| "epoch": 6.6616842435482235, |
| "grad_norm": 1.0393638610839844, |
| "learning_rate": 1.6691624915806277e-05, |
| "loss": 0.0227, |
| "num_input_tokens_seen": 739323392, |
| "step": 722000 |
| }, |
| { |
| "epoch": 6.666297598287523, |
| "grad_norm": 1.2030943632125854, |
| "learning_rate": 1.666855814210978e-05, |
| "loss": 0.0234, |
| "num_input_tokens_seen": 739835392, |
| "step": 722500 |
| }, |
| { |
| "epoch": 6.670910953026822, |
| "grad_norm": 0.676896870136261, |
| "learning_rate": 1.664549136841328e-05, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 740347392, |
| "step": 723000 |
| }, |
| { |
| "epoch": 6.675524307766121, |
| "grad_norm": 0.9208011031150818, |
| "learning_rate": 1.662242459471679e-05, |
| "loss": 0.0235, |
| "num_input_tokens_seen": 740859392, |
| "step": 723500 |
| }, |
| { |
| "epoch": 6.6801376625054205, |
| "grad_norm": 0.5715643763542175, |
| "learning_rate": 1.659935782102029e-05, |
| "loss": 0.0192, |
| "num_input_tokens_seen": 741371392, |
| "step": 724000 |
| }, |
| { |
| "epoch": 6.68475101724472, |
| "grad_norm": 3.038097381591797, |
| "learning_rate": 1.6576291047323793e-05, |
| "loss": 0.0227, |
| "num_input_tokens_seen": 741883392, |
| "step": 724500 |
| }, |
| { |
| "epoch": 6.689364371984019, |
| "grad_norm": 0.7479985952377319, |
| "learning_rate": 1.6553224273627297e-05, |
| "loss": 0.022, |
| "num_input_tokens_seen": 742395392, |
| "step": 725000 |
| }, |
| { |
| "epoch": 6.693977726723318, |
| "grad_norm": 0.4049575626850128, |
| "learning_rate": 1.65301574999308e-05, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 742907392, |
| "step": 725500 |
| }, |
| { |
| "epoch": 6.698591081462618, |
| "grad_norm": 1.12605881690979, |
| "learning_rate": 1.6507090726234305e-05, |
| "loss": 0.0227, |
| "num_input_tokens_seen": 743419392, |
| "step": 726000 |
| }, |
| { |
| "epoch": 6.703204436201918, |
| "grad_norm": 0.9142519235610962, |
| "learning_rate": 1.648402395253781e-05, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 743931392, |
| "step": 726500 |
| }, |
| { |
| "epoch": 6.707817790941217, |
| "grad_norm": 2.4688339233398438, |
| "learning_rate": 1.646095717884131e-05, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 744443392, |
| "step": 727000 |
| }, |
| { |
| "epoch": 6.712431145680516, |
| "grad_norm": 0.49617233872413635, |
| "learning_rate": 1.6437890405144817e-05, |
| "loss": 0.0215, |
| "num_input_tokens_seen": 744955392, |
| "step": 727500 |
| }, |
| { |
| "epoch": 6.717044500419815, |
| "grad_norm": 2.4510884284973145, |
| "learning_rate": 1.6414823631448317e-05, |
| "loss": 0.022, |
| "num_input_tokens_seen": 745467392, |
| "step": 728000 |
| }, |
| { |
| "epoch": 6.721657855159115, |
| "grad_norm": 0.6233497262001038, |
| "learning_rate": 1.639175685775182e-05, |
| "loss": 0.0209, |
| "num_input_tokens_seen": 745979392, |
| "step": 728500 |
| }, |
| { |
| "epoch": 6.726271209898414, |
| "grad_norm": 1.1352206468582153, |
| "learning_rate": 1.6368690084055325e-05, |
| "loss": 0.0194, |
| "num_input_tokens_seen": 746491392, |
| "step": 729000 |
| }, |
| { |
| "epoch": 6.730884564637713, |
| "grad_norm": 0.4292503297328949, |
| "learning_rate": 1.6345623310358826e-05, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 747003392, |
| "step": 729500 |
| }, |
| { |
| "epoch": 6.735497919377012, |
| "grad_norm": 0.7327638864517212, |
| "learning_rate": 1.632255653666233e-05, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 747515392, |
| "step": 730000 |
| }, |
| { |
| "epoch": 6.740111274116312, |
| "grad_norm": 1.2657952308654785, |
| "learning_rate": 1.6299489762965834e-05, |
| "loss": 0.021, |
| "num_input_tokens_seen": 748027392, |
| "step": 730500 |
| }, |
| { |
| "epoch": 6.744724628855611, |
| "grad_norm": 2.1072635650634766, |
| "learning_rate": 1.6276422989269338e-05, |
| "loss": 0.0205, |
| "num_input_tokens_seen": 748539392, |
| "step": 731000 |
| }, |
| { |
| "epoch": 6.749337983594911, |
| "grad_norm": 0.5420140027999878, |
| "learning_rate": 1.6253356215572842e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 749051392, |
| "step": 731500 |
| }, |
| { |
| "epoch": 6.75395133833421, |
| "grad_norm": 0.9647169709205627, |
| "learning_rate": 1.6230289441876346e-05, |
| "loss": 0.023, |
| "num_input_tokens_seen": 749563392, |
| "step": 732000 |
| }, |
| { |
| "epoch": 6.7585646930735095, |
| "grad_norm": 0.5795858502388, |
| "learning_rate": 1.6207222668179846e-05, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 750075392, |
| "step": 732500 |
| }, |
| { |
| "epoch": 6.763178047812809, |
| "grad_norm": 0.776720404624939, |
| "learning_rate": 1.6184155894483354e-05, |
| "loss": 0.0239, |
| "num_input_tokens_seen": 750587392, |
| "step": 733000 |
| }, |
| { |
| "epoch": 6.767791402552108, |
| "grad_norm": 3.4119088649749756, |
| "learning_rate": 1.6161089120786854e-05, |
| "loss": 0.0234, |
| "num_input_tokens_seen": 751099392, |
| "step": 733500 |
| }, |
| { |
| "epoch": 6.772404757291407, |
| "grad_norm": 0.5689214468002319, |
| "learning_rate": 1.6138022347090358e-05, |
| "loss": 0.021, |
| "num_input_tokens_seen": 751611392, |
| "step": 734000 |
| }, |
| { |
| "epoch": 6.7770181120307065, |
| "grad_norm": 0.6440141201019287, |
| "learning_rate": 1.6114955573393862e-05, |
| "loss": 0.0228, |
| "num_input_tokens_seen": 752123392, |
| "step": 734500 |
| }, |
| { |
| "epoch": 6.781631466770006, |
| "grad_norm": 0.5016751289367676, |
| "learning_rate": 1.6091888799697366e-05, |
| "loss": 0.023, |
| "num_input_tokens_seen": 752635392, |
| "step": 735000 |
| }, |
| { |
| "epoch": 6.786244821509305, |
| "grad_norm": 0.6144362092018127, |
| "learning_rate": 1.6068822026000867e-05, |
| "loss": 0.0227, |
| "num_input_tokens_seen": 753147392, |
| "step": 735500 |
| }, |
| { |
| "epoch": 6.790858176248604, |
| "grad_norm": 0.356981486082077, |
| "learning_rate": 1.604575525230437e-05, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 753659392, |
| "step": 736000 |
| }, |
| { |
| "epoch": 6.7954715309879035, |
| "grad_norm": 0.6662021279335022, |
| "learning_rate": 1.6022688478607875e-05, |
| "loss": 0.0222, |
| "num_input_tokens_seen": 754171392, |
| "step": 736500 |
| }, |
| { |
| "epoch": 6.800084885727204, |
| "grad_norm": 1.0647578239440918, |
| "learning_rate": 1.599962170491138e-05, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 754683392, |
| "step": 737000 |
| }, |
| { |
| "epoch": 6.804698240466502, |
| "grad_norm": 0.8494476675987244, |
| "learning_rate": 1.5976554931214883e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 755195392, |
| "step": 737500 |
| }, |
| { |
| "epoch": 6.809311595205802, |
| "grad_norm": 1.5736192464828491, |
| "learning_rate": 1.5953488157518383e-05, |
| "loss": 0.0245, |
| "num_input_tokens_seen": 755707392, |
| "step": 738000 |
| }, |
| { |
| "epoch": 6.813924949945101, |
| "grad_norm": 1.5811710357666016, |
| "learning_rate": 1.593042138382189e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 756219392, |
| "step": 738500 |
| }, |
| { |
| "epoch": 6.818538304684401, |
| "grad_norm": 0.7430917024612427, |
| "learning_rate": 1.590735461012539e-05, |
| "loss": 0.0238, |
| "num_input_tokens_seen": 756731392, |
| "step": 739000 |
| }, |
| { |
| "epoch": 6.8231516594237, |
| "grad_norm": 0.346450537443161, |
| "learning_rate": 1.5884287836428895e-05, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 757243392, |
| "step": 739500 |
| }, |
| { |
| "epoch": 6.827765014162999, |
| "grad_norm": 5.301863670349121, |
| "learning_rate": 1.58612210627324e-05, |
| "loss": 0.0228, |
| "num_input_tokens_seen": 757755392, |
| "step": 740000 |
| }, |
| { |
| "epoch": 6.832378368902298, |
| "grad_norm": 0.9501894116401672, |
| "learning_rate": 1.5838154289035903e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 758267392, |
| "step": 740500 |
| }, |
| { |
| "epoch": 6.836991723641598, |
| "grad_norm": 0.4030236601829529, |
| "learning_rate": 1.5815087515339404e-05, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 758779392, |
| "step": 741000 |
| }, |
| { |
| "epoch": 6.841605078380897, |
| "grad_norm": 3.976102352142334, |
| "learning_rate": 1.579202074164291e-05, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 759291392, |
| "step": 741500 |
| }, |
| { |
| "epoch": 6.846218433120196, |
| "grad_norm": 1.0763275623321533, |
| "learning_rate": 1.576895396794641e-05, |
| "loss": 0.0247, |
| "num_input_tokens_seen": 759803392, |
| "step": 742000 |
| }, |
| { |
| "epoch": 6.850831787859495, |
| "grad_norm": 1.278295636177063, |
| "learning_rate": 1.5745887194249916e-05, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 760315392, |
| "step": 742500 |
| }, |
| { |
| "epoch": 6.855445142598795, |
| "grad_norm": 1.3523164987564087, |
| "learning_rate": 1.572282042055342e-05, |
| "loss": 0.0215, |
| "num_input_tokens_seen": 760827392, |
| "step": 743000 |
| }, |
| { |
| "epoch": 6.860058497338095, |
| "grad_norm": 2.487576484680176, |
| "learning_rate": 1.569975364685692e-05, |
| "loss": 0.0197, |
| "num_input_tokens_seen": 761339392, |
| "step": 743500 |
| }, |
| { |
| "epoch": 6.864671852077394, |
| "grad_norm": 0.43189629912376404, |
| "learning_rate": 1.5676686873160428e-05, |
| "loss": 0.0209, |
| "num_input_tokens_seen": 761851392, |
| "step": 744000 |
| }, |
| { |
| "epoch": 6.869285206816693, |
| "grad_norm": 1.3960847854614258, |
| "learning_rate": 1.5653620099463928e-05, |
| "loss": 0.0211, |
| "num_input_tokens_seen": 762363392, |
| "step": 744500 |
| }, |
| { |
| "epoch": 6.873898561555992, |
| "grad_norm": 0.642167866230011, |
| "learning_rate": 1.5630553325767432e-05, |
| "loss": 0.0228, |
| "num_input_tokens_seen": 762875392, |
| "step": 745000 |
| }, |
| { |
| "epoch": 6.878511916295292, |
| "grad_norm": 0.7163909673690796, |
| "learning_rate": 1.5607486552070936e-05, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 763387392, |
| "step": 745500 |
| }, |
| { |
| "epoch": 6.883125271034591, |
| "grad_norm": 0.8028944134712219, |
| "learning_rate": 1.558441977837444e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 763899392, |
| "step": 746000 |
| }, |
| { |
| "epoch": 6.88773862577389, |
| "grad_norm": 0.8963446617126465, |
| "learning_rate": 1.556135300467794e-05, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 764411392, |
| "step": 746500 |
| }, |
| { |
| "epoch": 6.892351980513189, |
| "grad_norm": 1.2736632823944092, |
| "learning_rate": 1.5538286230981448e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 764923392, |
| "step": 747000 |
| }, |
| { |
| "epoch": 6.896965335252489, |
| "grad_norm": 1.9002121686935425, |
| "learning_rate": 1.551521945728495e-05, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 765435392, |
| "step": 747500 |
| }, |
| { |
| "epoch": 6.901578689991788, |
| "grad_norm": 1.7518917322158813, |
| "learning_rate": 1.5492152683588453e-05, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 765947392, |
| "step": 748000 |
| }, |
| { |
| "epoch": 6.906192044731087, |
| "grad_norm": 0.5055529475212097, |
| "learning_rate": 1.5469085909891956e-05, |
| "loss": 0.0223, |
| "num_input_tokens_seen": 766459392, |
| "step": 748500 |
| }, |
| { |
| "epoch": 6.910805399470387, |
| "grad_norm": 1.280887246131897, |
| "learning_rate": 1.5446019136195457e-05, |
| "loss": 0.0215, |
| "num_input_tokens_seen": 766971392, |
| "step": 749000 |
| }, |
| { |
| "epoch": 6.9154187542096865, |
| "grad_norm": 1.3082467317581177, |
| "learning_rate": 1.5422952362498964e-05, |
| "loss": 0.0239, |
| "num_input_tokens_seen": 767483392, |
| "step": 749500 |
| }, |
| { |
| "epoch": 6.920032108948986, |
| "grad_norm": 0.4849281907081604, |
| "learning_rate": 1.5399885588802465e-05, |
| "loss": 0.0219, |
| "num_input_tokens_seen": 767995392, |
| "step": 750000 |
| }, |
| { |
| "epoch": 6.924645463688285, |
| "grad_norm": 1.54342520236969, |
| "learning_rate": 1.537681881510597e-05, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 768507392, |
| "step": 750500 |
| }, |
| { |
| "epoch": 6.929258818427584, |
| "grad_norm": 1.441550850868225, |
| "learning_rate": 1.5353752041409473e-05, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 769019392, |
| "step": 751000 |
| }, |
| { |
| "epoch": 6.9338721731668835, |
| "grad_norm": 1.3304697275161743, |
| "learning_rate": 1.5330685267712977e-05, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 769531392, |
| "step": 751500 |
| }, |
| { |
| "epoch": 6.938485527906183, |
| "grad_norm": 1.3655359745025635, |
| "learning_rate": 1.5307618494016477e-05, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 770043392, |
| "step": 752000 |
| }, |
| { |
| "epoch": 6.943098882645482, |
| "grad_norm": 1.3380628824234009, |
| "learning_rate": 1.5284551720319985e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 770555392, |
| "step": 752500 |
| }, |
| { |
| "epoch": 6.947712237384781, |
| "grad_norm": 0.7669854164123535, |
| "learning_rate": 1.5261484946623485e-05, |
| "loss": 0.0219, |
| "num_input_tokens_seen": 771067392, |
| "step": 753000 |
| }, |
| { |
| "epoch": 6.9523255921240805, |
| "grad_norm": 0.653236985206604, |
| "learning_rate": 1.5238418172926991e-05, |
| "loss": 0.0197, |
| "num_input_tokens_seen": 771579392, |
| "step": 753500 |
| }, |
| { |
| "epoch": 6.95693894686338, |
| "grad_norm": 0.7252629995346069, |
| "learning_rate": 1.5215351399230493e-05, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 772091392, |
| "step": 754000 |
| }, |
| { |
| "epoch": 6.96155230160268, |
| "grad_norm": 0.7869466543197632, |
| "learning_rate": 1.5192284625533997e-05, |
| "loss": 0.0219, |
| "num_input_tokens_seen": 772603392, |
| "step": 754500 |
| }, |
| { |
| "epoch": 6.966165656341978, |
| "grad_norm": 1.048891544342041, |
| "learning_rate": 1.51692178518375e-05, |
| "loss": 0.0246, |
| "num_input_tokens_seen": 773115392, |
| "step": 755000 |
| }, |
| { |
| "epoch": 6.970779011081278, |
| "grad_norm": 0.7492154836654663, |
| "learning_rate": 1.5146151078141002e-05, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 773627392, |
| "step": 755500 |
| }, |
| { |
| "epoch": 6.975392365820578, |
| "grad_norm": 1.5296510457992554, |
| "learning_rate": 1.5123084304444508e-05, |
| "loss": 0.023, |
| "num_input_tokens_seen": 774139392, |
| "step": 756000 |
| }, |
| { |
| "epoch": 6.980005720559877, |
| "grad_norm": 0.6391850113868713, |
| "learning_rate": 1.510001753074801e-05, |
| "loss": 0.0211, |
| "num_input_tokens_seen": 774651392, |
| "step": 756500 |
| }, |
| { |
| "epoch": 6.984619075299176, |
| "grad_norm": 1.2069010734558105, |
| "learning_rate": 1.5076950757051514e-05, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 775163392, |
| "step": 757000 |
| }, |
| { |
| "epoch": 6.989232430038475, |
| "grad_norm": 2.368687629699707, |
| "learning_rate": 1.5053883983355016e-05, |
| "loss": 0.024, |
| "num_input_tokens_seen": 775675392, |
| "step": 757500 |
| }, |
| { |
| "epoch": 6.993845784777775, |
| "grad_norm": 1.284287452697754, |
| "learning_rate": 1.5030817209658522e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 776187392, |
| "step": 758000 |
| }, |
| { |
| "epoch": 6.998459139517074, |
| "grad_norm": 5.402317523956299, |
| "learning_rate": 1.5007750435962022e-05, |
| "loss": 0.025, |
| "num_input_tokens_seen": 776699392, |
| "step": 758500 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_combined_score": 0.06412914552577642, |
| "eval_loss": 0.06412914395332336, |
| "eval_mse": 0.06412914709822949, |
| "eval_runtime": 47.0336, |
| "eval_samples_per_second": 2048.28, |
| "eval_steps_per_second": 256.051, |
| "num_input_tokens_seen": 776869632, |
| "step": 758667 |
| }, |
| { |
| "epoch": 7.003072494256373, |
| "grad_norm": 1.346767783164978, |
| "learning_rate": 1.4984683662265528e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 777210624, |
| "step": 759000 |
| }, |
| { |
| "epoch": 7.007685848995672, |
| "grad_norm": 0.9796298146247864, |
| "learning_rate": 1.496161688856903e-05, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 777722624, |
| "step": 759500 |
| }, |
| { |
| "epoch": 7.012299203734972, |
| "grad_norm": 1.2551716566085815, |
| "learning_rate": 1.4938550114872534e-05, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 778234624, |
| "step": 760000 |
| }, |
| { |
| "epoch": 7.016912558474272, |
| "grad_norm": 0.8987337946891785, |
| "learning_rate": 1.4915483341176037e-05, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 778746624, |
| "step": 760500 |
| }, |
| { |
| "epoch": 7.021525913213571, |
| "grad_norm": 0.38303157687187195, |
| "learning_rate": 1.4892416567479542e-05, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 779258624, |
| "step": 761000 |
| }, |
| { |
| "epoch": 7.02613926795287, |
| "grad_norm": 1.3380213975906372, |
| "learning_rate": 1.4869349793783044e-05, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 779770624, |
| "step": 761500 |
| }, |
| { |
| "epoch": 7.0307526226921695, |
| "grad_norm": 2.466179609298706, |
| "learning_rate": 1.4846283020086547e-05, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 780282624, |
| "step": 762000 |
| }, |
| { |
| "epoch": 7.035365977431469, |
| "grad_norm": 0.4640190303325653, |
| "learning_rate": 1.482321624639005e-05, |
| "loss": 0.0168, |
| "num_input_tokens_seen": 780794624, |
| "step": 762500 |
| }, |
| { |
| "epoch": 7.039979332170768, |
| "grad_norm": 0.6390454173088074, |
| "learning_rate": 1.4800149472693553e-05, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 781306624, |
| "step": 763000 |
| }, |
| { |
| "epoch": 7.044592686910067, |
| "grad_norm": 0.9119462966918945, |
| "learning_rate": 1.4777082698997059e-05, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 781818624, |
| "step": 763500 |
| }, |
| { |
| "epoch": 7.0492060416493665, |
| "grad_norm": 1.088921070098877, |
| "learning_rate": 1.475401592530056e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 782330624, |
| "step": 764000 |
| }, |
| { |
| "epoch": 7.053819396388666, |
| "grad_norm": 0.5869113802909851, |
| "learning_rate": 1.4730949151604065e-05, |
| "loss": 0.0191, |
| "num_input_tokens_seen": 782842624, |
| "step": 764500 |
| }, |
| { |
| "epoch": 7.058432751127965, |
| "grad_norm": 1.6925584077835083, |
| "learning_rate": 1.4707882377907567e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 783354624, |
| "step": 765000 |
| }, |
| { |
| "epoch": 7.063046105867264, |
| "grad_norm": 1.0733281373977661, |
| "learning_rate": 1.4684815604211071e-05, |
| "loss": 0.0171, |
| "num_input_tokens_seen": 783866624, |
| "step": 765500 |
| }, |
| { |
| "epoch": 7.0676594606065635, |
| "grad_norm": 0.3278258442878723, |
| "learning_rate": 1.4661748830514573e-05, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 784378624, |
| "step": 766000 |
| }, |
| { |
| "epoch": 7.072272815345864, |
| "grad_norm": 2.2622592449188232, |
| "learning_rate": 1.4638682056818079e-05, |
| "loss": 0.0171, |
| "num_input_tokens_seen": 784890624, |
| "step": 766500 |
| }, |
| { |
| "epoch": 7.076886170085163, |
| "grad_norm": 0.846518337726593, |
| "learning_rate": 1.4615615283121581e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 785402624, |
| "step": 767000 |
| }, |
| { |
| "epoch": 7.081499524824462, |
| "grad_norm": 0.9698590636253357, |
| "learning_rate": 1.4592548509425085e-05, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 785914624, |
| "step": 767500 |
| }, |
| { |
| "epoch": 7.086112879563761, |
| "grad_norm": 0.5238065123558044, |
| "learning_rate": 1.4569481735728588e-05, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 786426624, |
| "step": 768000 |
| }, |
| { |
| "epoch": 7.090726234303061, |
| "grad_norm": 0.7391173839569092, |
| "learning_rate": 1.454641496203209e-05, |
| "loss": 0.0193, |
| "num_input_tokens_seen": 786938624, |
| "step": 768500 |
| }, |
| { |
| "epoch": 7.09533958904236, |
| "grad_norm": 0.8646796941757202, |
| "learning_rate": 1.4523348188335596e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 787450624, |
| "step": 769000 |
| }, |
| { |
| "epoch": 7.099952943781659, |
| "grad_norm": 0.5301780700683594, |
| "learning_rate": 1.4500281414639096e-05, |
| "loss": 0.017, |
| "num_input_tokens_seen": 787962624, |
| "step": 769500 |
| }, |
| { |
| "epoch": 7.104566298520958, |
| "grad_norm": 2.3351125717163086, |
| "learning_rate": 1.4477214640942602e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 788474624, |
| "step": 770000 |
| }, |
| { |
| "epoch": 7.109179653260258, |
| "grad_norm": 0.59925377368927, |
| "learning_rate": 1.4454147867246104e-05, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 788986624, |
| "step": 770500 |
| }, |
| { |
| "epoch": 7.113793007999557, |
| "grad_norm": 0.5372639298439026, |
| "learning_rate": 1.4431081093549608e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 789498624, |
| "step": 771000 |
| }, |
| { |
| "epoch": 7.118406362738856, |
| "grad_norm": 1.028199553489685, |
| "learning_rate": 1.440801431985311e-05, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 790010624, |
| "step": 771500 |
| }, |
| { |
| "epoch": 7.123019717478156, |
| "grad_norm": 0.32566505670547485, |
| "learning_rate": 1.4384947546156616e-05, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 790522624, |
| "step": 772000 |
| }, |
| { |
| "epoch": 7.1276330722174555, |
| "grad_norm": 1.434348702430725, |
| "learning_rate": 1.4361880772460118e-05, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 791034624, |
| "step": 772500 |
| }, |
| { |
| "epoch": 7.132246426956755, |
| "grad_norm": 1.0634896755218506, |
| "learning_rate": 1.4338813998763622e-05, |
| "loss": 0.0172, |
| "num_input_tokens_seen": 791546624, |
| "step": 773000 |
| }, |
| { |
| "epoch": 7.136859781696054, |
| "grad_norm": 1.0522830486297607, |
| "learning_rate": 1.4315747225067125e-05, |
| "loss": 0.017, |
| "num_input_tokens_seen": 792058624, |
| "step": 773500 |
| }, |
| { |
| "epoch": 7.141473136435353, |
| "grad_norm": 1.2891104221343994, |
| "learning_rate": 1.429268045137063e-05, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 792570624, |
| "step": 774000 |
| }, |
| { |
| "epoch": 7.1460864911746524, |
| "grad_norm": 0.5944826006889343, |
| "learning_rate": 1.4269613677674132e-05, |
| "loss": 0.0168, |
| "num_input_tokens_seen": 793082624, |
| "step": 774500 |
| }, |
| { |
| "epoch": 7.150699845913952, |
| "grad_norm": 1.0896071195602417, |
| "learning_rate": 1.4246546903977635e-05, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 793594624, |
| "step": 775000 |
| }, |
| { |
| "epoch": 7.155313200653251, |
| "grad_norm": 0.5116850137710571, |
| "learning_rate": 1.4223480130281139e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 794106624, |
| "step": 775500 |
| }, |
| { |
| "epoch": 7.15992655539255, |
| "grad_norm": 0.6353034377098083, |
| "learning_rate": 1.4200413356584641e-05, |
| "loss": 0.015, |
| "num_input_tokens_seen": 794618624, |
| "step": 776000 |
| }, |
| { |
| "epoch": 7.164539910131849, |
| "grad_norm": 2.1156020164489746, |
| "learning_rate": 1.4177346582888145e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 795130624, |
| "step": 776500 |
| }, |
| { |
| "epoch": 7.169153264871149, |
| "grad_norm": 0.4953656494617462, |
| "learning_rate": 1.4154279809191647e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 795642624, |
| "step": 777000 |
| }, |
| { |
| "epoch": 7.173766619610448, |
| "grad_norm": 0.39725926518440247, |
| "learning_rate": 1.4131213035495153e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 796154624, |
| "step": 777500 |
| }, |
| { |
| "epoch": 7.178379974349748, |
| "grad_norm": 0.7973536849021912, |
| "learning_rate": 1.4108146261798655e-05, |
| "loss": 0.0192, |
| "num_input_tokens_seen": 796666624, |
| "step": 778000 |
| }, |
| { |
| "epoch": 7.182993329089047, |
| "grad_norm": 0.27644041180610657, |
| "learning_rate": 1.4085079488102159e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 797178624, |
| "step": 778500 |
| }, |
| { |
| "epoch": 7.1876066838283466, |
| "grad_norm": 0.5681914687156677, |
| "learning_rate": 1.4062012714405661e-05, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 797690624, |
| "step": 779000 |
| }, |
| { |
| "epoch": 7.192220038567646, |
| "grad_norm": 0.19514349102973938, |
| "learning_rate": 1.4038945940709167e-05, |
| "loss": 0.0176, |
| "num_input_tokens_seen": 798202624, |
| "step": 779500 |
| }, |
| { |
| "epoch": 7.196833393306945, |
| "grad_norm": 1.4721050262451172, |
| "learning_rate": 1.401587916701267e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 798714624, |
| "step": 780000 |
| }, |
| { |
| "epoch": 7.201446748046244, |
| "grad_norm": 0.7421937584877014, |
| "learning_rate": 1.3992812393316173e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 799226624, |
| "step": 780500 |
| }, |
| { |
| "epoch": 7.2060601027855435, |
| "grad_norm": 0.12846527993679047, |
| "learning_rate": 1.3969745619619676e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 799738624, |
| "step": 781000 |
| }, |
| { |
| "epoch": 7.210673457524843, |
| "grad_norm": 0.8358561992645264, |
| "learning_rate": 1.3946678845923178e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 800250624, |
| "step": 781500 |
| }, |
| { |
| "epoch": 7.215286812264142, |
| "grad_norm": 1.0720690488815308, |
| "learning_rate": 1.3923612072226684e-05, |
| "loss": 0.0161, |
| "num_input_tokens_seen": 800762624, |
| "step": 782000 |
| }, |
| { |
| "epoch": 7.219900167003441, |
| "grad_norm": 0.4553976356983185, |
| "learning_rate": 1.3900545298530184e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 801274624, |
| "step": 782500 |
| }, |
| { |
| "epoch": 7.2245135217427405, |
| "grad_norm": 1.1510006189346313, |
| "learning_rate": 1.387747852483369e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 801786624, |
| "step": 783000 |
| }, |
| { |
| "epoch": 7.22912687648204, |
| "grad_norm": 1.1483092308044434, |
| "learning_rate": 1.3854411751137192e-05, |
| "loss": 0.0171, |
| "num_input_tokens_seen": 802298624, |
| "step": 783500 |
| }, |
| { |
| "epoch": 7.23374023122134, |
| "grad_norm": 0.4925529658794403, |
| "learning_rate": 1.3831344977440696e-05, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 802810624, |
| "step": 784000 |
| }, |
| { |
| "epoch": 7.238353585960639, |
| "grad_norm": 0.3787945508956909, |
| "learning_rate": 1.3808278203744198e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 803322624, |
| "step": 784500 |
| }, |
| { |
| "epoch": 7.242966940699938, |
| "grad_norm": 0.6160422563552856, |
| "learning_rate": 1.3785211430047704e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 803834624, |
| "step": 785000 |
| }, |
| { |
| "epoch": 7.247580295439238, |
| "grad_norm": 1.1294529438018799, |
| "learning_rate": 1.3762144656351206e-05, |
| "loss": 0.02, |
| "num_input_tokens_seen": 804346624, |
| "step": 785500 |
| }, |
| { |
| "epoch": 7.252193650178537, |
| "grad_norm": 0.6138213872909546, |
| "learning_rate": 1.373907788265471e-05, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 804858624, |
| "step": 786000 |
| }, |
| { |
| "epoch": 7.256807004917836, |
| "grad_norm": 0.5684888362884521, |
| "learning_rate": 1.3716011108958212e-05, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 805370624, |
| "step": 786500 |
| }, |
| { |
| "epoch": 7.261420359657135, |
| "grad_norm": 0.7051540613174438, |
| "learning_rate": 1.3692944335261718e-05, |
| "loss": 0.0161, |
| "num_input_tokens_seen": 805882624, |
| "step": 787000 |
| }, |
| { |
| "epoch": 7.266033714396435, |
| "grad_norm": 0.7892741560935974, |
| "learning_rate": 1.366987756156522e-05, |
| "loss": 0.0193, |
| "num_input_tokens_seen": 806394624, |
| "step": 787500 |
| }, |
| { |
| "epoch": 7.270647069135734, |
| "grad_norm": 1.084768533706665, |
| "learning_rate": 1.3646810787868721e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 806906624, |
| "step": 788000 |
| }, |
| { |
| "epoch": 7.275260423875033, |
| "grad_norm": 1.111611008644104, |
| "learning_rate": 1.3623744014172227e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 807418624, |
| "step": 788500 |
| }, |
| { |
| "epoch": 7.279873778614332, |
| "grad_norm": 1.2572911977767944, |
| "learning_rate": 1.3600677240475729e-05, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 807930624, |
| "step": 789000 |
| }, |
| { |
| "epoch": 7.2844871333536325, |
| "grad_norm": 1.4147090911865234, |
| "learning_rate": 1.3577610466779233e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 808442624, |
| "step": 789500 |
| }, |
| { |
| "epoch": 7.289100488092932, |
| "grad_norm": 1.129238247871399, |
| "learning_rate": 1.3554543693082735e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 808954624, |
| "step": 790000 |
| }, |
| { |
| "epoch": 7.293713842832231, |
| "grad_norm": 0.7517364621162415, |
| "learning_rate": 1.3531476919386241e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 809466624, |
| "step": 790500 |
| }, |
| { |
| "epoch": 7.29832719757153, |
| "grad_norm": 2.005709171295166, |
| "learning_rate": 1.3508410145689743e-05, |
| "loss": 0.019, |
| "num_input_tokens_seen": 809978624, |
| "step": 791000 |
| }, |
| { |
| "epoch": 7.3029405523108295, |
| "grad_norm": 0.5718657374382019, |
| "learning_rate": 1.3485343371993247e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 810490624, |
| "step": 791500 |
| }, |
| { |
| "epoch": 7.307553907050129, |
| "grad_norm": 2.84344744682312, |
| "learning_rate": 1.346227659829675e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 811002624, |
| "step": 792000 |
| }, |
| { |
| "epoch": 7.312167261789428, |
| "grad_norm": 1.8831250667572021, |
| "learning_rate": 1.3439209824600255e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 811514624, |
| "step": 792500 |
| }, |
| { |
| "epoch": 7.316780616528727, |
| "grad_norm": 0.42998257279396057, |
| "learning_rate": 1.3416143050903757e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 812026624, |
| "step": 793000 |
| }, |
| { |
| "epoch": 7.3213939712680265, |
| "grad_norm": 0.4875911474227905, |
| "learning_rate": 1.3393076277207261e-05, |
| "loss": 0.0202, |
| "num_input_tokens_seen": 812538624, |
| "step": 793500 |
| }, |
| { |
| "epoch": 7.326007326007326, |
| "grad_norm": 0.6313169002532959, |
| "learning_rate": 1.3370009503510764e-05, |
| "loss": 0.0185, |
| "num_input_tokens_seen": 813050624, |
| "step": 794000 |
| }, |
| { |
| "epoch": 7.330620680746625, |
| "grad_norm": 0.5315720438957214, |
| "learning_rate": 1.3346942729814266e-05, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 813562624, |
| "step": 794500 |
| }, |
| { |
| "epoch": 7.335234035485925, |
| "grad_norm": 0.636077344417572, |
| "learning_rate": 1.332387595611777e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 814074624, |
| "step": 795000 |
| }, |
| { |
| "epoch": 7.339847390225224, |
| "grad_norm": 1.2620755434036255, |
| "learning_rate": 1.3300809182421272e-05, |
| "loss": 0.0185, |
| "num_input_tokens_seen": 814586624, |
| "step": 795500 |
| }, |
| { |
| "epoch": 7.344460744964524, |
| "grad_norm": 0.40610164403915405, |
| "learning_rate": 1.3277742408724778e-05, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 815098624, |
| "step": 796000 |
| }, |
| { |
| "epoch": 7.349074099703823, |
| "grad_norm": 0.5910019278526306, |
| "learning_rate": 1.325467563502828e-05, |
| "loss": 0.0191, |
| "num_input_tokens_seen": 815610624, |
| "step": 796500 |
| }, |
| { |
| "epoch": 7.353687454443122, |
| "grad_norm": 0.9699934720993042, |
| "learning_rate": 1.3231608861331784e-05, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 816122624, |
| "step": 797000 |
| }, |
| { |
| "epoch": 7.358300809182421, |
| "grad_norm": 0.5334429740905762, |
| "learning_rate": 1.3208542087635286e-05, |
| "loss": 0.0185, |
| "num_input_tokens_seen": 816634624, |
| "step": 797500 |
| }, |
| { |
| "epoch": 7.362914163921721, |
| "grad_norm": 0.47226250171661377, |
| "learning_rate": 1.3185475313938792e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 817146624, |
| "step": 798000 |
| }, |
| { |
| "epoch": 7.36752751866102, |
| "grad_norm": 3.1056435108184814, |
| "learning_rate": 1.3162408540242294e-05, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 817658624, |
| "step": 798500 |
| }, |
| { |
| "epoch": 7.372140873400319, |
| "grad_norm": 0.8559852838516235, |
| "learning_rate": 1.3139341766545798e-05, |
| "loss": 0.0186, |
| "num_input_tokens_seen": 818170624, |
| "step": 799000 |
| }, |
| { |
| "epoch": 7.376754228139618, |
| "grad_norm": 0.5092094540596008, |
| "learning_rate": 1.31162749928493e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 818682624, |
| "step": 799500 |
| }, |
| { |
| "epoch": 7.381367582878918, |
| "grad_norm": 0.7403343915939331, |
| "learning_rate": 1.3093208219152806e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 819194624, |
| "step": 800000 |
| }, |
| { |
| "epoch": 7.385980937618217, |
| "grad_norm": 1.0396490097045898, |
| "learning_rate": 1.3070141445456308e-05, |
| "loss": 0.0176, |
| "num_input_tokens_seen": 819706624, |
| "step": 800500 |
| }, |
| { |
| "epoch": 7.390594292357516, |
| "grad_norm": 1.229277491569519, |
| "learning_rate": 1.3047074671759809e-05, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 820218624, |
| "step": 801000 |
| }, |
| { |
| "epoch": 7.395207647096816, |
| "grad_norm": 1.870112419128418, |
| "learning_rate": 1.3024007898063315e-05, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 820730624, |
| "step": 801500 |
| }, |
| { |
| "epoch": 7.3998210018361155, |
| "grad_norm": 2.495352029800415, |
| "learning_rate": 1.3000941124366817e-05, |
| "loss": 0.0176, |
| "num_input_tokens_seen": 821242624, |
| "step": 802000 |
| }, |
| { |
| "epoch": 7.404434356575415, |
| "grad_norm": 1.2543821334838867, |
| "learning_rate": 1.2977874350670321e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 821754624, |
| "step": 802500 |
| }, |
| { |
| "epoch": 7.409047711314714, |
| "grad_norm": 0.9267345666885376, |
| "learning_rate": 1.2954807576973823e-05, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 822266624, |
| "step": 803000 |
| }, |
| { |
| "epoch": 7.413661066054013, |
| "grad_norm": 0.7813261151313782, |
| "learning_rate": 1.2931740803277329e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 822778624, |
| "step": 803500 |
| }, |
| { |
| "epoch": 7.4182744207933125, |
| "grad_norm": 2.1433377265930176, |
| "learning_rate": 1.2908674029580831e-05, |
| "loss": 0.0185, |
| "num_input_tokens_seen": 823290624, |
| "step": 804000 |
| }, |
| { |
| "epoch": 7.422887775532612, |
| "grad_norm": 0.4169975519180298, |
| "learning_rate": 1.2885607255884335e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 823802624, |
| "step": 804500 |
| }, |
| { |
| "epoch": 7.427501130271911, |
| "grad_norm": 0.7654904723167419, |
| "learning_rate": 1.2862540482187837e-05, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 824314624, |
| "step": 805000 |
| }, |
| { |
| "epoch": 7.43211448501121, |
| "grad_norm": 0.7712762355804443, |
| "learning_rate": 1.2839473708491343e-05, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 824826624, |
| "step": 805500 |
| }, |
| { |
| "epoch": 7.436727839750509, |
| "grad_norm": 1.179842233657837, |
| "learning_rate": 1.2816406934794845e-05, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 825338624, |
| "step": 806000 |
| }, |
| { |
| "epoch": 7.441341194489809, |
| "grad_norm": 1.1706069707870483, |
| "learning_rate": 1.279334016109835e-05, |
| "loss": 0.0191, |
| "num_input_tokens_seen": 825850624, |
| "step": 806500 |
| }, |
| { |
| "epoch": 7.445954549229109, |
| "grad_norm": 1.7458144426345825, |
| "learning_rate": 1.2770273387401852e-05, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 826362624, |
| "step": 807000 |
| }, |
| { |
| "epoch": 7.450567903968408, |
| "grad_norm": 0.8518096804618835, |
| "learning_rate": 1.2747206613705354e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 826874624, |
| "step": 807500 |
| }, |
| { |
| "epoch": 7.455181258707707, |
| "grad_norm": 0.6776919960975647, |
| "learning_rate": 1.2724139840008858e-05, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 827386624, |
| "step": 808000 |
| }, |
| { |
| "epoch": 7.459794613447007, |
| "grad_norm": 1.8147574663162231, |
| "learning_rate": 1.270107306631236e-05, |
| "loss": 0.0172, |
| "num_input_tokens_seen": 827898624, |
| "step": 808500 |
| }, |
| { |
| "epoch": 7.464407968186306, |
| "grad_norm": 0.730553150177002, |
| "learning_rate": 1.2678006292615866e-05, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 828410624, |
| "step": 809000 |
| }, |
| { |
| "epoch": 7.469021322925605, |
| "grad_norm": 0.5966499447822571, |
| "learning_rate": 1.2654939518919368e-05, |
| "loss": 0.0185, |
| "num_input_tokens_seen": 828922624, |
| "step": 809500 |
| }, |
| { |
| "epoch": 7.473634677664904, |
| "grad_norm": 0.5111476182937622, |
| "learning_rate": 1.2631872745222872e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 829434624, |
| "step": 810000 |
| }, |
| { |
| "epoch": 7.4782480324042035, |
| "grad_norm": 1.1634365320205688, |
| "learning_rate": 1.2608805971526374e-05, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 829946624, |
| "step": 810500 |
| }, |
| { |
| "epoch": 7.482861387143503, |
| "grad_norm": 1.030910611152649, |
| "learning_rate": 1.258573919782988e-05, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 830458624, |
| "step": 811000 |
| }, |
| { |
| "epoch": 7.487474741882802, |
| "grad_norm": 1.035938024520874, |
| "learning_rate": 1.2562672424133382e-05, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 830970624, |
| "step": 811500 |
| }, |
| { |
| "epoch": 7.492088096622101, |
| "grad_norm": 1.1685384511947632, |
| "learning_rate": 1.2539605650436886e-05, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 831482624, |
| "step": 812000 |
| }, |
| { |
| "epoch": 7.496701451361401, |
| "grad_norm": 0.8186880946159363, |
| "learning_rate": 1.2516538876740388e-05, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 831994624, |
| "step": 812500 |
| }, |
| { |
| "epoch": 7.501314806100701, |
| "grad_norm": 1.2309128046035767, |
| "learning_rate": 1.2493472103043892e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 832506624, |
| "step": 813000 |
| }, |
| { |
| "epoch": 7.50592816084, |
| "grad_norm": 0.9243940114974976, |
| "learning_rate": 1.2470405329347395e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 833018624, |
| "step": 813500 |
| }, |
| { |
| "epoch": 7.510541515579299, |
| "grad_norm": 1.5183156728744507, |
| "learning_rate": 1.2447338555650899e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 833530624, |
| "step": 814000 |
| }, |
| { |
| "epoch": 7.515154870318598, |
| "grad_norm": 0.7042239904403687, |
| "learning_rate": 1.2424271781954403e-05, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 834042624, |
| "step": 814500 |
| }, |
| { |
| "epoch": 7.519768225057898, |
| "grad_norm": 0.7798308730125427, |
| "learning_rate": 1.2401205008257907e-05, |
| "loss": 0.0203, |
| "num_input_tokens_seen": 834554624, |
| "step": 815000 |
| }, |
| { |
| "epoch": 7.524381579797197, |
| "grad_norm": 0.6466756463050842, |
| "learning_rate": 1.2378138234561409e-05, |
| "loss": 0.019, |
| "num_input_tokens_seen": 835066624, |
| "step": 815500 |
| }, |
| { |
| "epoch": 7.528994934536496, |
| "grad_norm": 1.0861841440200806, |
| "learning_rate": 1.2355071460864913e-05, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 835578624, |
| "step": 816000 |
| }, |
| { |
| "epoch": 7.533608289275795, |
| "grad_norm": 2.7624402046203613, |
| "learning_rate": 1.2332004687168417e-05, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 836090624, |
| "step": 816500 |
| }, |
| { |
| "epoch": 7.538221644015095, |
| "grad_norm": 1.2840367555618286, |
| "learning_rate": 1.2308937913471919e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 836602624, |
| "step": 817000 |
| }, |
| { |
| "epoch": 7.542834998754394, |
| "grad_norm": 0.6789388656616211, |
| "learning_rate": 1.2285871139775421e-05, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 837114624, |
| "step": 817500 |
| }, |
| { |
| "epoch": 7.547448353493694, |
| "grad_norm": 0.5279095768928528, |
| "learning_rate": 1.2262804366078925e-05, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 837626624, |
| "step": 818000 |
| }, |
| { |
| "epoch": 7.552061708232992, |
| "grad_norm": 0.5110554099082947, |
| "learning_rate": 1.223973759238243e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 838138624, |
| "step": 818500 |
| }, |
| { |
| "epoch": 7.5566750629722925, |
| "grad_norm": 1.535260796546936, |
| "learning_rate": 1.2216670818685932e-05, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 838650624, |
| "step": 819000 |
| }, |
| { |
| "epoch": 7.561288417711592, |
| "grad_norm": 3.005444049835205, |
| "learning_rate": 1.2193604044989436e-05, |
| "loss": 0.0194, |
| "num_input_tokens_seen": 839162624, |
| "step": 819500 |
| }, |
| { |
| "epoch": 7.565901772450891, |
| "grad_norm": 0.3890930712223053, |
| "learning_rate": 1.217053727129294e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 839674624, |
| "step": 820000 |
| }, |
| { |
| "epoch": 7.57051512719019, |
| "grad_norm": 3.0413002967834473, |
| "learning_rate": 1.2147470497596444e-05, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 840186624, |
| "step": 820500 |
| }, |
| { |
| "epoch": 7.5751284819294895, |
| "grad_norm": 0.33747154474258423, |
| "learning_rate": 1.2124403723899946e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 840698624, |
| "step": 821000 |
| }, |
| { |
| "epoch": 7.579741836668789, |
| "grad_norm": 0.7888673543930054, |
| "learning_rate": 1.210133695020345e-05, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 841210624, |
| "step": 821500 |
| }, |
| { |
| "epoch": 7.584355191408088, |
| "grad_norm": 0.5673322081565857, |
| "learning_rate": 1.2078270176506954e-05, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 841722624, |
| "step": 822000 |
| }, |
| { |
| "epoch": 7.588968546147387, |
| "grad_norm": 7.8960700035095215, |
| "learning_rate": 1.2055203402810456e-05, |
| "loss": 0.0171, |
| "num_input_tokens_seen": 842234624, |
| "step": 822500 |
| }, |
| { |
| "epoch": 7.5935819008866865, |
| "grad_norm": 0.6810684204101562, |
| "learning_rate": 1.203213662911396e-05, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 842746624, |
| "step": 823000 |
| }, |
| { |
| "epoch": 7.598195255625986, |
| "grad_norm": 0.88917076587677, |
| "learning_rate": 1.2009069855417462e-05, |
| "loss": 0.0176, |
| "num_input_tokens_seen": 843258624, |
| "step": 823500 |
| }, |
| { |
| "epoch": 7.602808610365285, |
| "grad_norm": 0.7236852049827576, |
| "learning_rate": 1.1986003081720966e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 843770624, |
| "step": 824000 |
| }, |
| { |
| "epoch": 7.607421965104585, |
| "grad_norm": 2.4100208282470703, |
| "learning_rate": 1.196293630802447e-05, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 844282624, |
| "step": 824500 |
| }, |
| { |
| "epoch": 7.612035319843884, |
| "grad_norm": 0.9818079471588135, |
| "learning_rate": 1.1939869534327972e-05, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 844794624, |
| "step": 825000 |
| }, |
| { |
| "epoch": 7.616648674583184, |
| "grad_norm": 5.109523773193359, |
| "learning_rate": 1.1916802760631476e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 845306624, |
| "step": 825500 |
| }, |
| { |
| "epoch": 7.621262029322483, |
| "grad_norm": 1.1535288095474243, |
| "learning_rate": 1.189373598693498e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 845818624, |
| "step": 826000 |
| }, |
| { |
| "epoch": 7.625875384061782, |
| "grad_norm": 1.0759390592575073, |
| "learning_rate": 1.1870669213238483e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 846330624, |
| "step": 826500 |
| }, |
| { |
| "epoch": 7.630488738801081, |
| "grad_norm": 0.9492645263671875, |
| "learning_rate": 1.1847602439541987e-05, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 846842624, |
| "step": 827000 |
| }, |
| { |
| "epoch": 7.635102093540381, |
| "grad_norm": 0.5077918767929077, |
| "learning_rate": 1.182453566584549e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 847354624, |
| "step": 827500 |
| }, |
| { |
| "epoch": 7.63971544827968, |
| "grad_norm": 0.5069125890731812, |
| "learning_rate": 1.1801468892148995e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 847866624, |
| "step": 828000 |
| }, |
| { |
| "epoch": 7.644328803018979, |
| "grad_norm": 0.35941779613494873, |
| "learning_rate": 1.1778402118452497e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 848378624, |
| "step": 828500 |
| }, |
| { |
| "epoch": 7.648942157758278, |
| "grad_norm": 0.7320166230201721, |
| "learning_rate": 1.1755335344756001e-05, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 848890624, |
| "step": 829000 |
| }, |
| { |
| "epoch": 7.653555512497578, |
| "grad_norm": 0.4909152686595917, |
| "learning_rate": 1.1732268571059505e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 849402624, |
| "step": 829500 |
| }, |
| { |
| "epoch": 7.658168867236878, |
| "grad_norm": 0.5299736857414246, |
| "learning_rate": 1.1709201797363007e-05, |
| "loss": 0.017, |
| "num_input_tokens_seen": 849914624, |
| "step": 830000 |
| }, |
| { |
| "epoch": 7.662782221976177, |
| "grad_norm": 1.6265432834625244, |
| "learning_rate": 1.168613502366651e-05, |
| "loss": 0.0192, |
| "num_input_tokens_seen": 850426624, |
| "step": 830500 |
| }, |
| { |
| "epoch": 7.667395576715476, |
| "grad_norm": 1.0842050313949585, |
| "learning_rate": 1.1663068249970013e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 850938624, |
| "step": 831000 |
| }, |
| { |
| "epoch": 7.6720089314547755, |
| "grad_norm": 0.46629172563552856, |
| "learning_rate": 1.1640001476273517e-05, |
| "loss": 0.019, |
| "num_input_tokens_seen": 851450624, |
| "step": 831500 |
| }, |
| { |
| "epoch": 7.676622286194075, |
| "grad_norm": 0.786178469657898, |
| "learning_rate": 1.161693470257702e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 851962624, |
| "step": 832000 |
| }, |
| { |
| "epoch": 7.681235640933374, |
| "grad_norm": 0.9928342700004578, |
| "learning_rate": 1.1593867928880524e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 852474624, |
| "step": 832500 |
| }, |
| { |
| "epoch": 7.685848995672673, |
| "grad_norm": 0.19910675287246704, |
| "learning_rate": 1.1570801155184028e-05, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 852986624, |
| "step": 833000 |
| }, |
| { |
| "epoch": 7.6904623504119725, |
| "grad_norm": 0.44422009587287903, |
| "learning_rate": 1.1547734381487532e-05, |
| "loss": 0.019, |
| "num_input_tokens_seen": 853498624, |
| "step": 833500 |
| }, |
| { |
| "epoch": 7.695075705151272, |
| "grad_norm": 1.4326293468475342, |
| "learning_rate": 1.1524667607791034e-05, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 854010624, |
| "step": 834000 |
| }, |
| { |
| "epoch": 7.699689059890571, |
| "grad_norm": 2.208235263824463, |
| "learning_rate": 1.1501600834094538e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 854522624, |
| "step": 834500 |
| }, |
| { |
| "epoch": 7.70430241462987, |
| "grad_norm": 1.5056183338165283, |
| "learning_rate": 1.1478534060398042e-05, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 855034624, |
| "step": 835000 |
| }, |
| { |
| "epoch": 7.70891576936917, |
| "grad_norm": 0.991448700428009, |
| "learning_rate": 1.1455467286701544e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 855546624, |
| "step": 835500 |
| }, |
| { |
| "epoch": 7.71352912410847, |
| "grad_norm": 0.48746320605278015, |
| "learning_rate": 1.1432400513005048e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 856058624, |
| "step": 836000 |
| }, |
| { |
| "epoch": 7.718142478847769, |
| "grad_norm": 0.7954283356666565, |
| "learning_rate": 1.140933373930855e-05, |
| "loss": 0.0186, |
| "num_input_tokens_seen": 856570624, |
| "step": 836500 |
| }, |
| { |
| "epoch": 7.722755833587068, |
| "grad_norm": 0.3314274251461029, |
| "learning_rate": 1.1386266965612054e-05, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 857082624, |
| "step": 837000 |
| }, |
| { |
| "epoch": 7.727369188326367, |
| "grad_norm": 0.40846577286720276, |
| "learning_rate": 1.1363200191915556e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 857594624, |
| "step": 837500 |
| }, |
| { |
| "epoch": 7.731982543065667, |
| "grad_norm": 0.5026475787162781, |
| "learning_rate": 1.134013341821906e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 858106624, |
| "step": 838000 |
| }, |
| { |
| "epoch": 7.736595897804966, |
| "grad_norm": 0.7746123671531677, |
| "learning_rate": 1.1317066644522564e-05, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 858618624, |
| "step": 838500 |
| }, |
| { |
| "epoch": 7.741209252544265, |
| "grad_norm": 0.835455060005188, |
| "learning_rate": 1.1293999870826068e-05, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 859130624, |
| "step": 839000 |
| }, |
| { |
| "epoch": 7.745822607283564, |
| "grad_norm": 1.107001781463623, |
| "learning_rate": 1.127093309712957e-05, |
| "loss": 0.0201, |
| "num_input_tokens_seen": 859642624, |
| "step": 839500 |
| }, |
| { |
| "epoch": 7.750435962022864, |
| "grad_norm": 0.31434282660484314, |
| "learning_rate": 1.1247866323433075e-05, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 860154624, |
| "step": 840000 |
| }, |
| { |
| "epoch": 7.755049316762163, |
| "grad_norm": 0.7980784773826599, |
| "learning_rate": 1.1224799549736579e-05, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 860666624, |
| "step": 840500 |
| }, |
| { |
| "epoch": 7.759662671501462, |
| "grad_norm": 0.6341221332550049, |
| "learning_rate": 1.1201732776040081e-05, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 861178624, |
| "step": 841000 |
| }, |
| { |
| "epoch": 7.764276026240761, |
| "grad_norm": 1.298004388809204, |
| "learning_rate": 1.1178666002343585e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 861690624, |
| "step": 841500 |
| }, |
| { |
| "epoch": 7.768889380980061, |
| "grad_norm": 0.6212522983551025, |
| "learning_rate": 1.1155599228647089e-05, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 862202624, |
| "step": 842000 |
| }, |
| { |
| "epoch": 7.773502735719361, |
| "grad_norm": 1.0448174476623535, |
| "learning_rate": 1.1132532454950593e-05, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 862714624, |
| "step": 842500 |
| }, |
| { |
| "epoch": 7.77811609045866, |
| "grad_norm": 0.4349260628223419, |
| "learning_rate": 1.1109465681254095e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 863226624, |
| "step": 843000 |
| }, |
| { |
| "epoch": 7.782729445197959, |
| "grad_norm": 0.5279752016067505, |
| "learning_rate": 1.1086398907557597e-05, |
| "loss": 0.0191, |
| "num_input_tokens_seen": 863738624, |
| "step": 843500 |
| }, |
| { |
| "epoch": 7.787342799937258, |
| "grad_norm": 2.5519967079162598, |
| "learning_rate": 1.1063332133861101e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 864250624, |
| "step": 844000 |
| }, |
| { |
| "epoch": 7.791956154676558, |
| "grad_norm": 1.002515435218811, |
| "learning_rate": 1.1040265360164605e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 864762624, |
| "step": 844500 |
| }, |
| { |
| "epoch": 7.796569509415857, |
| "grad_norm": 1.0723029375076294, |
| "learning_rate": 1.1017198586468108e-05, |
| "loss": 0.0172, |
| "num_input_tokens_seen": 865274624, |
| "step": 845000 |
| }, |
| { |
| "epoch": 7.801182864155156, |
| "grad_norm": 0.492806613445282, |
| "learning_rate": 1.0994131812771612e-05, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 865786624, |
| "step": 845500 |
| }, |
| { |
| "epoch": 7.805796218894455, |
| "grad_norm": 2.1584246158599854, |
| "learning_rate": 1.0971065039075116e-05, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 866298624, |
| "step": 846000 |
| }, |
| { |
| "epoch": 7.810409573633755, |
| "grad_norm": 0.9871762990951538, |
| "learning_rate": 1.094799826537862e-05, |
| "loss": 0.0172, |
| "num_input_tokens_seen": 866810624, |
| "step": 846500 |
| }, |
| { |
| "epoch": 7.815022928373054, |
| "grad_norm": 1.234832525253296, |
| "learning_rate": 1.0924931491682122e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 867322624, |
| "step": 847000 |
| }, |
| { |
| "epoch": 7.819636283112354, |
| "grad_norm": 0.8536167144775391, |
| "learning_rate": 1.0901864717985626e-05, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 867834624, |
| "step": 847500 |
| }, |
| { |
| "epoch": 7.824249637851653, |
| "grad_norm": 0.5045762658119202, |
| "learning_rate": 1.087879794428913e-05, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 868346624, |
| "step": 848000 |
| }, |
| { |
| "epoch": 7.8288629925909525, |
| "grad_norm": 0.539504885673523, |
| "learning_rate": 1.0855731170592632e-05, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 868858624, |
| "step": 848500 |
| }, |
| { |
| "epoch": 7.833476347330252, |
| "grad_norm": 0.6124027967453003, |
| "learning_rate": 1.0832664396896136e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 869370624, |
| "step": 849000 |
| }, |
| { |
| "epoch": 7.838089702069551, |
| "grad_norm": 0.5063890814781189, |
| "learning_rate": 1.0809597623199638e-05, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 869882624, |
| "step": 849500 |
| }, |
| { |
| "epoch": 7.84270305680885, |
| "grad_norm": 0.4935370087623596, |
| "learning_rate": 1.0786530849503142e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 870394624, |
| "step": 850000 |
| }, |
| { |
| "epoch": 7.8473164115481495, |
| "grad_norm": 1.3337877988815308, |
| "learning_rate": 1.0763464075806644e-05, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 870906624, |
| "step": 850500 |
| }, |
| { |
| "epoch": 7.851929766287449, |
| "grad_norm": 0.5984758734703064, |
| "learning_rate": 1.0740397302110148e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 871418624, |
| "step": 851000 |
| }, |
| { |
| "epoch": 7.856543121026748, |
| "grad_norm": 0.6499104499816895, |
| "learning_rate": 1.0717330528413652e-05, |
| "loss": 0.0176, |
| "num_input_tokens_seen": 871930624, |
| "step": 851500 |
| }, |
| { |
| "epoch": 7.861156475766047, |
| "grad_norm": 0.5723326206207275, |
| "learning_rate": 1.0694263754717156e-05, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 872442624, |
| "step": 852000 |
| }, |
| { |
| "epoch": 7.8657698305053465, |
| "grad_norm": 0.6458103060722351, |
| "learning_rate": 1.0671196981020659e-05, |
| "loss": 0.0185, |
| "num_input_tokens_seen": 872954624, |
| "step": 852500 |
| }, |
| { |
| "epoch": 7.870383185244647, |
| "grad_norm": 0.6607184410095215, |
| "learning_rate": 1.0648130207324163e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 873466624, |
| "step": 853000 |
| }, |
| { |
| "epoch": 7.874996539983946, |
| "grad_norm": 0.7945510745048523, |
| "learning_rate": 1.0625063433627667e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 873978624, |
| "step": 853500 |
| }, |
| { |
| "epoch": 7.879609894723245, |
| "grad_norm": 0.9480940103530884, |
| "learning_rate": 1.0601996659931169e-05, |
| "loss": 0.019, |
| "num_input_tokens_seen": 874490624, |
| "step": 854000 |
| }, |
| { |
| "epoch": 7.884223249462544, |
| "grad_norm": 0.5195125937461853, |
| "learning_rate": 1.0578929886234673e-05, |
| "loss": 0.017, |
| "num_input_tokens_seen": 875002624, |
| "step": 854500 |
| }, |
| { |
| "epoch": 7.888836604201844, |
| "grad_norm": 0.3116241693496704, |
| "learning_rate": 1.0555863112538177e-05, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 875514624, |
| "step": 855000 |
| }, |
| { |
| "epoch": 7.893449958941143, |
| "grad_norm": 0.8278101086616516, |
| "learning_rate": 1.053279633884168e-05, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 876026624, |
| "step": 855500 |
| }, |
| { |
| "epoch": 7.898063313680442, |
| "grad_norm": 0.6848555207252502, |
| "learning_rate": 1.0509729565145181e-05, |
| "loss": 0.0186, |
| "num_input_tokens_seen": 876538624, |
| "step": 856000 |
| }, |
| { |
| "epoch": 7.902676668419741, |
| "grad_norm": 0.9749637842178345, |
| "learning_rate": 1.0486662791448685e-05, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 877050624, |
| "step": 856500 |
| }, |
| { |
| "epoch": 7.907290023159041, |
| "grad_norm": 2.486924648284912, |
| "learning_rate": 1.046359601775219e-05, |
| "loss": 0.0194, |
| "num_input_tokens_seen": 877562624, |
| "step": 857000 |
| }, |
| { |
| "epoch": 7.91190337789834, |
| "grad_norm": 0.8250918388366699, |
| "learning_rate": 1.0440529244055693e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 878074624, |
| "step": 857500 |
| }, |
| { |
| "epoch": 7.916516732637639, |
| "grad_norm": 1.9874022006988525, |
| "learning_rate": 1.0417462470359196e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 878586624, |
| "step": 858000 |
| }, |
| { |
| "epoch": 7.921130087376939, |
| "grad_norm": 1.451173186302185, |
| "learning_rate": 1.03943956966627e-05, |
| "loss": 0.0191, |
| "num_input_tokens_seen": 879098624, |
| "step": 858500 |
| }, |
| { |
| "epoch": 7.925743442116238, |
| "grad_norm": 3.8313064575195312, |
| "learning_rate": 1.0371328922966204e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 879610624, |
| "step": 859000 |
| }, |
| { |
| "epoch": 7.930356796855538, |
| "grad_norm": 0.9106965661048889, |
| "learning_rate": 1.0348262149269706e-05, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 880122624, |
| "step": 859500 |
| }, |
| { |
| "epoch": 7.934970151594837, |
| "grad_norm": 0.9856759905815125, |
| "learning_rate": 1.032519537557321e-05, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 880634624, |
| "step": 860000 |
| }, |
| { |
| "epoch": 7.939583506334136, |
| "grad_norm": 1.1179744005203247, |
| "learning_rate": 1.0302128601876714e-05, |
| "loss": 0.0186, |
| "num_input_tokens_seen": 881146624, |
| "step": 860500 |
| }, |
| { |
| "epoch": 7.9441968610734355, |
| "grad_norm": 0.8333301544189453, |
| "learning_rate": 1.0279061828180218e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 881658624, |
| "step": 861000 |
| }, |
| { |
| "epoch": 7.948810215812735, |
| "grad_norm": 0.4756206274032593, |
| "learning_rate": 1.025599505448372e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 882170624, |
| "step": 861500 |
| }, |
| { |
| "epoch": 7.953423570552034, |
| "grad_norm": 1.3627671003341675, |
| "learning_rate": 1.0232928280787224e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 882682624, |
| "step": 862000 |
| }, |
| { |
| "epoch": 7.958036925291333, |
| "grad_norm": 1.3066837787628174, |
| "learning_rate": 1.0209861507090726e-05, |
| "loss": 0.0193, |
| "num_input_tokens_seen": 883194624, |
| "step": 862500 |
| }, |
| { |
| "epoch": 7.9626502800306325, |
| "grad_norm": 0.46038496494293213, |
| "learning_rate": 1.018679473339423e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 883706624, |
| "step": 863000 |
| }, |
| { |
| "epoch": 7.967263634769932, |
| "grad_norm": 0.67403644323349, |
| "learning_rate": 1.0163727959697732e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 884218624, |
| "step": 863500 |
| }, |
| { |
| "epoch": 7.971876989509231, |
| "grad_norm": 0.7785734534263611, |
| "learning_rate": 1.0140661186001236e-05, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 884730624, |
| "step": 864000 |
| }, |
| { |
| "epoch": 7.97649034424853, |
| "grad_norm": 0.8497280478477478, |
| "learning_rate": 1.011759441230474e-05, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 885242624, |
| "step": 864500 |
| }, |
| { |
| "epoch": 7.98110369898783, |
| "grad_norm": 4.073908805847168, |
| "learning_rate": 1.0094527638608243e-05, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 885754624, |
| "step": 865000 |
| }, |
| { |
| "epoch": 7.98571705372713, |
| "grad_norm": 0.7901633977890015, |
| "learning_rate": 1.0071460864911747e-05, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 886266624, |
| "step": 865500 |
| }, |
| { |
| "epoch": 7.990330408466429, |
| "grad_norm": 2.1585545539855957, |
| "learning_rate": 1.004839409121525e-05, |
| "loss": 0.0172, |
| "num_input_tokens_seen": 886778624, |
| "step": 866000 |
| }, |
| { |
| "epoch": 7.994943763205728, |
| "grad_norm": 0.6002645492553711, |
| "learning_rate": 1.0025327317518755e-05, |
| "loss": 0.0193, |
| "num_input_tokens_seen": 887290624, |
| "step": 866500 |
| }, |
| { |
| "epoch": 7.999557117945027, |
| "grad_norm": 0.602433443069458, |
| "learning_rate": 1.0002260543822257e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 887802624, |
| "step": 867000 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_combined_score": 0.0675718570300666, |
| "eval_loss": 0.06757185608148575, |
| "eval_mse": 0.06757185797864745, |
| "eval_runtime": 46.9325, |
| "eval_samples_per_second": 2052.691, |
| "eval_steps_per_second": 256.602, |
| "num_input_tokens_seen": 887851008, |
| "step": 867048 |
| }, |
| { |
| "epoch": 8.004170472684327, |
| "grad_norm": 1.5062319040298462, |
| "learning_rate": 9.97919377012576e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 888313856, |
| "step": 867500 |
| }, |
| { |
| "epoch": 8.008783827423626, |
| "grad_norm": 0.3685579001903534, |
| "learning_rate": 9.956126996429265e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 888825856, |
| "step": 868000 |
| }, |
| { |
| "epoch": 8.013397182162926, |
| "grad_norm": 0.5031562447547913, |
| "learning_rate": 9.933060222732767e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 889337856, |
| "step": 868500 |
| }, |
| { |
| "epoch": 8.018010536902224, |
| "grad_norm": 1.041576623916626, |
| "learning_rate": 9.90999344903627e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 889849856, |
| "step": 869000 |
| }, |
| { |
| "epoch": 8.022623891641524, |
| "grad_norm": 0.6168863773345947, |
| "learning_rate": 9.886926675339773e-06, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 890361856, |
| "step": 869500 |
| }, |
| { |
| "epoch": 8.027237246380823, |
| "grad_norm": 1.0457834005355835, |
| "learning_rate": 9.863859901643277e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 890873856, |
| "step": 870000 |
| }, |
| { |
| "epoch": 8.031850601120123, |
| "grad_norm": 0.7371172904968262, |
| "learning_rate": 9.840793127946781e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 891385856, |
| "step": 870500 |
| }, |
| { |
| "epoch": 8.036463955859421, |
| "grad_norm": 1.857638955116272, |
| "learning_rate": 9.817726354250284e-06, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 891897856, |
| "step": 871000 |
| }, |
| { |
| "epoch": 8.041077310598721, |
| "grad_norm": 1.3631207942962646, |
| "learning_rate": 9.794659580553788e-06, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 892409856, |
| "step": 871500 |
| }, |
| { |
| "epoch": 8.04569066533802, |
| "grad_norm": 1.4387595653533936, |
| "learning_rate": 9.771592806857291e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 892921856, |
| "step": 872000 |
| }, |
| { |
| "epoch": 8.05030402007732, |
| "grad_norm": 0.44265180826187134, |
| "learning_rate": 9.748526033160794e-06, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 893433856, |
| "step": 872500 |
| }, |
| { |
| "epoch": 8.054917374816618, |
| "grad_norm": 0.7352337837219238, |
| "learning_rate": 9.725459259464298e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 893945856, |
| "step": 873000 |
| }, |
| { |
| "epoch": 8.059530729555918, |
| "grad_norm": 0.6806060075759888, |
| "learning_rate": 9.702392485767802e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 894457856, |
| "step": 873500 |
| }, |
| { |
| "epoch": 8.064144084295219, |
| "grad_norm": 0.7403847575187683, |
| "learning_rate": 9.679325712071306e-06, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 894969856, |
| "step": 874000 |
| }, |
| { |
| "epoch": 8.068757439034517, |
| "grad_norm": 1.1141221523284912, |
| "learning_rate": 9.656258938374808e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 895481856, |
| "step": 874500 |
| }, |
| { |
| "epoch": 8.073370793773817, |
| "grad_norm": 0.983514130115509, |
| "learning_rate": 9.633192164678312e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 895993856, |
| "step": 875000 |
| }, |
| { |
| "epoch": 8.077984148513115, |
| "grad_norm": 0.4191863536834717, |
| "learning_rate": 9.610125390981814e-06, |
| "loss": 0.0171, |
| "num_input_tokens_seen": 896505856, |
| "step": 875500 |
| }, |
| { |
| "epoch": 8.082597503252416, |
| "grad_norm": 0.4481130540370941, |
| "learning_rate": 9.587058617285318e-06, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 897017856, |
| "step": 876000 |
| }, |
| { |
| "epoch": 8.087210857991714, |
| "grad_norm": 0.7153156995773315, |
| "learning_rate": 9.56399184358882e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 897529856, |
| "step": 876500 |
| }, |
| { |
| "epoch": 8.091824212731014, |
| "grad_norm": 1.7068063020706177, |
| "learning_rate": 9.540925069892324e-06, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 898041856, |
| "step": 877000 |
| }, |
| { |
| "epoch": 8.096437567470312, |
| "grad_norm": 0.5899567008018494, |
| "learning_rate": 9.517858296195828e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 898553856, |
| "step": 877500 |
| }, |
| { |
| "epoch": 8.101050922209613, |
| "grad_norm": 0.9179006218910217, |
| "learning_rate": 9.49479152249933e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 899065856, |
| "step": 878000 |
| }, |
| { |
| "epoch": 8.105664276948911, |
| "grad_norm": 0.7641995549201965, |
| "learning_rate": 9.471724748802835e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 899577856, |
| "step": 878500 |
| }, |
| { |
| "epoch": 8.110277631688211, |
| "grad_norm": 0.8679375648498535, |
| "learning_rate": 9.448657975106339e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 900089856, |
| "step": 879000 |
| }, |
| { |
| "epoch": 8.114890986427511, |
| "grad_norm": 0.981959342956543, |
| "learning_rate": 9.425591201409843e-06, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 900601856, |
| "step": 879500 |
| }, |
| { |
| "epoch": 8.11950434116681, |
| "grad_norm": 0.5581063032150269, |
| "learning_rate": 9.402524427713345e-06, |
| "loss": 0.0153, |
| "num_input_tokens_seen": 901113856, |
| "step": 880000 |
| }, |
| { |
| "epoch": 8.12411769590611, |
| "grad_norm": 0.4459242522716522, |
| "learning_rate": 9.379457654016849e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 901625856, |
| "step": 880500 |
| }, |
| { |
| "epoch": 8.128731050645408, |
| "grad_norm": 0.5052184462547302, |
| "learning_rate": 9.356390880320353e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 902137856, |
| "step": 881000 |
| }, |
| { |
| "epoch": 8.133344405384708, |
| "grad_norm": 2.326282024383545, |
| "learning_rate": 9.333324106623855e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 902649856, |
| "step": 881500 |
| }, |
| { |
| "epoch": 8.137957760124007, |
| "grad_norm": 0.3621096909046173, |
| "learning_rate": 9.310257332927357e-06, |
| "loss": 0.015, |
| "num_input_tokens_seen": 903161856, |
| "step": 882000 |
| }, |
| { |
| "epoch": 8.142571114863307, |
| "grad_norm": 0.599589467048645, |
| "learning_rate": 9.287190559230861e-06, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 903673856, |
| "step": 882500 |
| }, |
| { |
| "epoch": 8.147184469602605, |
| "grad_norm": 0.6334195733070374, |
| "learning_rate": 9.264123785534365e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 904185856, |
| "step": 883000 |
| }, |
| { |
| "epoch": 8.151797824341905, |
| "grad_norm": 0.5166653990745544, |
| "learning_rate": 9.241057011837868e-06, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 904697856, |
| "step": 883500 |
| }, |
| { |
| "epoch": 8.156411179081203, |
| "grad_norm": 0.9574226140975952, |
| "learning_rate": 9.217990238141372e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 905209856, |
| "step": 884000 |
| }, |
| { |
| "epoch": 8.161024533820504, |
| "grad_norm": 0.7625335454940796, |
| "learning_rate": 9.194923464444876e-06, |
| "loss": 0.0172, |
| "num_input_tokens_seen": 905721856, |
| "step": 884500 |
| }, |
| { |
| "epoch": 8.165637888559804, |
| "grad_norm": 0.5956442356109619, |
| "learning_rate": 9.17185669074838e-06, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 906233856, |
| "step": 885000 |
| }, |
| { |
| "epoch": 8.170251243299102, |
| "grad_norm": 0.6293473243713379, |
| "learning_rate": 9.148789917051882e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 906745856, |
| "step": 885500 |
| }, |
| { |
| "epoch": 8.174864598038402, |
| "grad_norm": 1.834021806716919, |
| "learning_rate": 9.125723143355386e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 907257856, |
| "step": 886000 |
| }, |
| { |
| "epoch": 8.1794779527777, |
| "grad_norm": 0.4335891008377075, |
| "learning_rate": 9.10265636965889e-06, |
| "loss": 0.0161, |
| "num_input_tokens_seen": 907769856, |
| "step": 886500 |
| }, |
| { |
| "epoch": 8.184091307517, |
| "grad_norm": 0.573677659034729, |
| "learning_rate": 9.079589595962392e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 908281856, |
| "step": 887000 |
| }, |
| { |
| "epoch": 8.1887046622563, |
| "grad_norm": 0.7976333498954773, |
| "learning_rate": 9.056522822265896e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 908793856, |
| "step": 887500 |
| }, |
| { |
| "epoch": 8.1933180169956, |
| "grad_norm": 1.0269770622253418, |
| "learning_rate": 9.0334560485694e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 909305856, |
| "step": 888000 |
| }, |
| { |
| "epoch": 8.197931371734898, |
| "grad_norm": 0.9196085333824158, |
| "learning_rate": 9.010389274872902e-06, |
| "loss": 0.0161, |
| "num_input_tokens_seen": 909817856, |
| "step": 888500 |
| }, |
| { |
| "epoch": 8.202544726474198, |
| "grad_norm": 0.9371418952941895, |
| "learning_rate": 8.987322501176406e-06, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 910329856, |
| "step": 889000 |
| }, |
| { |
| "epoch": 8.207158081213496, |
| "grad_norm": 0.5787968635559082, |
| "learning_rate": 8.964255727479908e-06, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 910841856, |
| "step": 889500 |
| }, |
| { |
| "epoch": 8.211771435952796, |
| "grad_norm": 0.44304850697517395, |
| "learning_rate": 8.941188953783412e-06, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 911353856, |
| "step": 890000 |
| }, |
| { |
| "epoch": 8.216384790692096, |
| "grad_norm": 1.7044280767440796, |
| "learning_rate": 8.918122180086916e-06, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 911865856, |
| "step": 890500 |
| }, |
| { |
| "epoch": 8.220998145431395, |
| "grad_norm": 0.6133010983467102, |
| "learning_rate": 8.895055406390419e-06, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 912377856, |
| "step": 891000 |
| }, |
| { |
| "epoch": 8.225611500170695, |
| "grad_norm": 2.290767192840576, |
| "learning_rate": 8.871988632693923e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 912889856, |
| "step": 891500 |
| }, |
| { |
| "epoch": 8.230224854909993, |
| "grad_norm": 0.47266674041748047, |
| "learning_rate": 8.848921858997427e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 913401856, |
| "step": 892000 |
| }, |
| { |
| "epoch": 8.234838209649293, |
| "grad_norm": 0.7107419967651367, |
| "learning_rate": 8.82585508530093e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 913913856, |
| "step": 892500 |
| }, |
| { |
| "epoch": 8.239451564388592, |
| "grad_norm": 0.29794007539749146, |
| "learning_rate": 8.802788311604433e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 914425856, |
| "step": 893000 |
| }, |
| { |
| "epoch": 8.244064919127892, |
| "grad_norm": 0.9938859939575195, |
| "learning_rate": 8.779721537907937e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 914937856, |
| "step": 893500 |
| }, |
| { |
| "epoch": 8.24867827386719, |
| "grad_norm": 0.9996763467788696, |
| "learning_rate": 8.75665476421144e-06, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 915449856, |
| "step": 894000 |
| }, |
| { |
| "epoch": 8.25329162860649, |
| "grad_norm": 0.8853555917739868, |
| "learning_rate": 8.733587990514943e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 915961856, |
| "step": 894500 |
| }, |
| { |
| "epoch": 8.257904983345789, |
| "grad_norm": 0.5720754861831665, |
| "learning_rate": 8.710521216818445e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 916473856, |
| "step": 895000 |
| }, |
| { |
| "epoch": 8.262518338085089, |
| "grad_norm": 0.7386252880096436, |
| "learning_rate": 8.68745444312195e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 916985856, |
| "step": 895500 |
| }, |
| { |
| "epoch": 8.267131692824387, |
| "grad_norm": 0.5073798298835754, |
| "learning_rate": 8.664387669425453e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 917497856, |
| "step": 896000 |
| }, |
| { |
| "epoch": 8.271745047563687, |
| "grad_norm": 2.3658652305603027, |
| "learning_rate": 8.641320895728956e-06, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 918009856, |
| "step": 896500 |
| }, |
| { |
| "epoch": 8.276358402302987, |
| "grad_norm": 0.4761596620082855, |
| "learning_rate": 8.61825412203246e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 918521856, |
| "step": 897000 |
| }, |
| { |
| "epoch": 8.280971757042286, |
| "grad_norm": 0.5883774161338806, |
| "learning_rate": 8.595187348335963e-06, |
| "loss": 0.0153, |
| "num_input_tokens_seen": 919033856, |
| "step": 897500 |
| }, |
| { |
| "epoch": 8.285585111781586, |
| "grad_norm": 0.9515103101730347, |
| "learning_rate": 8.572120574639467e-06, |
| "loss": 0.015, |
| "num_input_tokens_seen": 919545856, |
| "step": 898000 |
| }, |
| { |
| "epoch": 8.290198466520884, |
| "grad_norm": 0.5109001398086548, |
| "learning_rate": 8.54905380094297e-06, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 920057856, |
| "step": 898500 |
| }, |
| { |
| "epoch": 8.294811821260184, |
| "grad_norm": 0.8202781081199646, |
| "learning_rate": 8.525987027246474e-06, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 920569856, |
| "step": 899000 |
| }, |
| { |
| "epoch": 8.299425175999483, |
| "grad_norm": 1.913580060005188, |
| "learning_rate": 8.502920253549978e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 921081856, |
| "step": 899500 |
| }, |
| { |
| "epoch": 8.304038530738783, |
| "grad_norm": 0.6409407258033752, |
| "learning_rate": 8.47985347985348e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 921593856, |
| "step": 900000 |
| }, |
| { |
| "epoch": 8.308651885478081, |
| "grad_norm": 0.4128231108188629, |
| "learning_rate": 8.456786706156984e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 922105856, |
| "step": 900500 |
| }, |
| { |
| "epoch": 8.313265240217381, |
| "grad_norm": 2.3555517196655273, |
| "learning_rate": 8.433719932460488e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 922617856, |
| "step": 901000 |
| }, |
| { |
| "epoch": 8.31787859495668, |
| "grad_norm": 1.5205661058425903, |
| "learning_rate": 8.41065315876399e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 923129856, |
| "step": 901500 |
| }, |
| { |
| "epoch": 8.32249194969598, |
| "grad_norm": 0.8352044224739075, |
| "learning_rate": 8.387586385067492e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 923641856, |
| "step": 902000 |
| }, |
| { |
| "epoch": 8.32710530443528, |
| "grad_norm": 0.256552129983902, |
| "learning_rate": 8.364519611370996e-06, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 924153856, |
| "step": 902500 |
| }, |
| { |
| "epoch": 8.331718659174578, |
| "grad_norm": 0.9458514451980591, |
| "learning_rate": 8.3414528376745e-06, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 924665856, |
| "step": 903000 |
| }, |
| { |
| "epoch": 8.336332013913879, |
| "grad_norm": 0.9356163740158081, |
| "learning_rate": 8.318386063978004e-06, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 925177856, |
| "step": 903500 |
| }, |
| { |
| "epoch": 8.340945368653177, |
| "grad_norm": 0.6801881790161133, |
| "learning_rate": 8.295319290281507e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 925689856, |
| "step": 904000 |
| }, |
| { |
| "epoch": 8.345558723392477, |
| "grad_norm": 1.2119888067245483, |
| "learning_rate": 8.27225251658501e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 926201856, |
| "step": 904500 |
| }, |
| { |
| "epoch": 8.350172078131775, |
| "grad_norm": 0.6034347414970398, |
| "learning_rate": 8.249185742888515e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 926713856, |
| "step": 905000 |
| }, |
| { |
| "epoch": 8.354785432871076, |
| "grad_norm": 0.47974085807800293, |
| "learning_rate": 8.226118969192017e-06, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 927225856, |
| "step": 905500 |
| }, |
| { |
| "epoch": 8.359398787610374, |
| "grad_norm": 0.7787156105041504, |
| "learning_rate": 8.20305219549552e-06, |
| "loss": 0.0161, |
| "num_input_tokens_seen": 927737856, |
| "step": 906000 |
| }, |
| { |
| "epoch": 8.364012142349674, |
| "grad_norm": 0.8252438306808472, |
| "learning_rate": 8.179985421799025e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 928249856, |
| "step": 906500 |
| }, |
| { |
| "epoch": 8.368625497088972, |
| "grad_norm": 1.7516320943832397, |
| "learning_rate": 8.156918648102529e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 928761856, |
| "step": 907000 |
| }, |
| { |
| "epoch": 8.373238851828273, |
| "grad_norm": 0.9089247584342957, |
| "learning_rate": 8.133851874406031e-06, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 929273856, |
| "step": 907500 |
| }, |
| { |
| "epoch": 8.377852206567571, |
| "grad_norm": 0.5961917042732239, |
| "learning_rate": 8.110785100709533e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 929785856, |
| "step": 908000 |
| }, |
| { |
| "epoch": 8.382465561306871, |
| "grad_norm": 0.9045282602310181, |
| "learning_rate": 8.087718327013037e-06, |
| "loss": 0.015, |
| "num_input_tokens_seen": 930297856, |
| "step": 908500 |
| }, |
| { |
| "epoch": 8.387078916046171, |
| "grad_norm": 2.7716050148010254, |
| "learning_rate": 8.064651553316541e-06, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 930809856, |
| "step": 909000 |
| }, |
| { |
| "epoch": 8.39169227078547, |
| "grad_norm": 0.8180987238883972, |
| "learning_rate": 8.041584779620044e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 931321856, |
| "step": 909500 |
| }, |
| { |
| "epoch": 8.39630562552477, |
| "grad_norm": 0.8871789574623108, |
| "learning_rate": 8.018518005923548e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 931833856, |
| "step": 910000 |
| }, |
| { |
| "epoch": 8.400918980264068, |
| "grad_norm": 0.9161932468414307, |
| "learning_rate": 7.995451232227051e-06, |
| "loss": 0.0153, |
| "num_input_tokens_seen": 932345856, |
| "step": 910500 |
| }, |
| { |
| "epoch": 8.405532335003368, |
| "grad_norm": 0.6723649501800537, |
| "learning_rate": 7.972384458530555e-06, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 932857856, |
| "step": 911000 |
| }, |
| { |
| "epoch": 8.410145689742667, |
| "grad_norm": 0.57211834192276, |
| "learning_rate": 7.949317684834058e-06, |
| "loss": 0.015, |
| "num_input_tokens_seen": 933369856, |
| "step": 911500 |
| }, |
| { |
| "epoch": 8.414759044481967, |
| "grad_norm": 0.7815681099891663, |
| "learning_rate": 7.926250911137562e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 933881856, |
| "step": 912000 |
| }, |
| { |
| "epoch": 8.419372399221265, |
| "grad_norm": 1.4835954904556274, |
| "learning_rate": 7.903184137441066e-06, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 934393856, |
| "step": 912500 |
| }, |
| { |
| "epoch": 8.423985753960565, |
| "grad_norm": 0.6556302905082703, |
| "learning_rate": 7.880117363744568e-06, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 934905856, |
| "step": 913000 |
| }, |
| { |
| "epoch": 8.428599108699864, |
| "grad_norm": 0.3592114746570587, |
| "learning_rate": 7.857050590048072e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 935417856, |
| "step": 913500 |
| }, |
| { |
| "epoch": 8.433212463439164, |
| "grad_norm": 1.0812350511550903, |
| "learning_rate": 7.833983816351576e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 935929856, |
| "step": 914000 |
| }, |
| { |
| "epoch": 8.437825818178464, |
| "grad_norm": 0.5357770919799805, |
| "learning_rate": 7.810917042655078e-06, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 936441856, |
| "step": 914500 |
| }, |
| { |
| "epoch": 8.442439172917762, |
| "grad_norm": 1.2673269510269165, |
| "learning_rate": 7.78785026895858e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 936953856, |
| "step": 915000 |
| }, |
| { |
| "epoch": 8.447052527657062, |
| "grad_norm": 1.7254928350448608, |
| "learning_rate": 7.764783495262084e-06, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 937465856, |
| "step": 915500 |
| }, |
| { |
| "epoch": 8.45166588239636, |
| "grad_norm": 0.740627646446228, |
| "learning_rate": 7.741716721565588e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 937977856, |
| "step": 916000 |
| }, |
| { |
| "epoch": 8.45627923713566, |
| "grad_norm": 0.8942471146583557, |
| "learning_rate": 7.718649947869092e-06, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 938489856, |
| "step": 916500 |
| }, |
| { |
| "epoch": 8.46089259187496, |
| "grad_norm": 0.5979003310203552, |
| "learning_rate": 7.695583174172595e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 939001856, |
| "step": 917000 |
| }, |
| { |
| "epoch": 8.46550594661426, |
| "grad_norm": 0.690619945526123, |
| "learning_rate": 7.672516400476099e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 939513856, |
| "step": 917500 |
| }, |
| { |
| "epoch": 8.470119301353558, |
| "grad_norm": 0.9563241004943848, |
| "learning_rate": 7.649449626779603e-06, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 940025856, |
| "step": 918000 |
| }, |
| { |
| "epoch": 8.474732656092858, |
| "grad_norm": 0.7812721729278564, |
| "learning_rate": 7.626382853083106e-06, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 940537856, |
| "step": 918500 |
| }, |
| { |
| "epoch": 8.479346010832156, |
| "grad_norm": 0.7864488959312439, |
| "learning_rate": 7.603316079386609e-06, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 941049856, |
| "step": 919000 |
| }, |
| { |
| "epoch": 8.483959365571456, |
| "grad_norm": 0.41324466466903687, |
| "learning_rate": 7.580249305690113e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 941561856, |
| "step": 919500 |
| }, |
| { |
| "epoch": 8.488572720310756, |
| "grad_norm": 1.0213603973388672, |
| "learning_rate": 7.557182531993616e-06, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 942073856, |
| "step": 920000 |
| }, |
| { |
| "epoch": 8.493186075050055, |
| "grad_norm": 0.9692112803459167, |
| "learning_rate": 7.53411575829712e-06, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 942585856, |
| "step": 920500 |
| }, |
| { |
| "epoch": 8.497799429789355, |
| "grad_norm": 0.9468556642532349, |
| "learning_rate": 7.511048984600621e-06, |
| "loss": 0.015, |
| "num_input_tokens_seen": 943097856, |
| "step": 921000 |
| }, |
| { |
| "epoch": 8.502412784528653, |
| "grad_norm": 1.1541293859481812, |
| "learning_rate": 7.487982210904125e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 943609856, |
| "step": 921500 |
| }, |
| { |
| "epoch": 8.507026139267953, |
| "grad_norm": 0.6092996597290039, |
| "learning_rate": 7.464915437207628e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 944121856, |
| "step": 922000 |
| }, |
| { |
| "epoch": 8.511639494007252, |
| "grad_norm": 2.1357691287994385, |
| "learning_rate": 7.441848663511132e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 944633856, |
| "step": 922500 |
| }, |
| { |
| "epoch": 8.516252848746552, |
| "grad_norm": 0.8940873146057129, |
| "learning_rate": 7.4187818898146355e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 945145856, |
| "step": 923000 |
| }, |
| { |
| "epoch": 8.52086620348585, |
| "grad_norm": 0.44890737533569336, |
| "learning_rate": 7.395715116118139e-06, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 945657856, |
| "step": 923500 |
| }, |
| { |
| "epoch": 8.52547955822515, |
| "grad_norm": 0.6357942223548889, |
| "learning_rate": 7.372648342421643e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 946169856, |
| "step": 924000 |
| }, |
| { |
| "epoch": 8.530092912964449, |
| "grad_norm": 1.20125150680542, |
| "learning_rate": 7.349581568725146e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 946681856, |
| "step": 924500 |
| }, |
| { |
| "epoch": 8.534706267703749, |
| "grad_norm": 1.3115291595458984, |
| "learning_rate": 7.32651479502865e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 947193856, |
| "step": 925000 |
| }, |
| { |
| "epoch": 8.539319622443049, |
| "grad_norm": 1.5604932308197021, |
| "learning_rate": 7.303448021332153e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 947705856, |
| "step": 925500 |
| }, |
| { |
| "epoch": 8.543932977182347, |
| "grad_norm": 0.5092642307281494, |
| "learning_rate": 7.280381247635657e-06, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 948217856, |
| "step": 926000 |
| }, |
| { |
| "epoch": 8.548546331921647, |
| "grad_norm": 0.914828896522522, |
| "learning_rate": 7.25731447393916e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 948729856, |
| "step": 926500 |
| }, |
| { |
| "epoch": 8.553159686660946, |
| "grad_norm": 0.554459810256958, |
| "learning_rate": 7.234247700242663e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 949241856, |
| "step": 927000 |
| }, |
| { |
| "epoch": 8.557773041400246, |
| "grad_norm": 0.48894843459129333, |
| "learning_rate": 7.211180926546165e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 949753856, |
| "step": 927500 |
| }, |
| { |
| "epoch": 8.562386396139544, |
| "grad_norm": 1.2641159296035767, |
| "learning_rate": 7.188114152849669e-06, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 950265856, |
| "step": 928000 |
| }, |
| { |
| "epoch": 8.566999750878844, |
| "grad_norm": 0.9658982157707214, |
| "learning_rate": 7.165047379153172e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 950777856, |
| "step": 928500 |
| }, |
| { |
| "epoch": 8.571613105618143, |
| "grad_norm": 1.2537494897842407, |
| "learning_rate": 7.141980605456676e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 951289856, |
| "step": 929000 |
| }, |
| { |
| "epoch": 8.576226460357443, |
| "grad_norm": 2.147233009338379, |
| "learning_rate": 7.1189138317601795e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 951801856, |
| "step": 929500 |
| }, |
| { |
| "epoch": 8.580839815096741, |
| "grad_norm": 1.6873968839645386, |
| "learning_rate": 7.095847058063683e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 952313856, |
| "step": 930000 |
| }, |
| { |
| "epoch": 8.585453169836041, |
| "grad_norm": 1.5905687808990479, |
| "learning_rate": 7.072780284367187e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 952825856, |
| "step": 930500 |
| }, |
| { |
| "epoch": 8.590066524575342, |
| "grad_norm": 0.8234834671020508, |
| "learning_rate": 7.04971351067069e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 953337856, |
| "step": 931000 |
| }, |
| { |
| "epoch": 8.59467987931464, |
| "grad_norm": 1.0002344846725464, |
| "learning_rate": 7.026646736974194e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 953849856, |
| "step": 931500 |
| }, |
| { |
| "epoch": 8.59929323405394, |
| "grad_norm": 4.079251289367676, |
| "learning_rate": 7.003579963277697e-06, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 954361856, |
| "step": 932000 |
| }, |
| { |
| "epoch": 8.603906588793238, |
| "grad_norm": 0.8030288815498352, |
| "learning_rate": 6.980513189581201e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 954873856, |
| "step": 932500 |
| }, |
| { |
| "epoch": 8.608519943532539, |
| "grad_norm": 0.8186569213867188, |
| "learning_rate": 6.957446415884704e-06, |
| "loss": 0.0161, |
| "num_input_tokens_seen": 955385856, |
| "step": 933000 |
| }, |
| { |
| "epoch": 8.613133298271837, |
| "grad_norm": 0.680074155330658, |
| "learning_rate": 6.934379642188207e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 955897856, |
| "step": 933500 |
| }, |
| { |
| "epoch": 8.617746653011137, |
| "grad_norm": 1.1147595643997192, |
| "learning_rate": 6.911312868491709e-06, |
| "loss": 0.0171, |
| "num_input_tokens_seen": 956409856, |
| "step": 934000 |
| }, |
| { |
| "epoch": 8.622360007750435, |
| "grad_norm": 1.0557124614715576, |
| "learning_rate": 6.888246094795213e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 956921856, |
| "step": 934500 |
| }, |
| { |
| "epoch": 8.626973362489736, |
| "grad_norm": 0.5240976214408875, |
| "learning_rate": 6.865179321098716e-06, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 957433856, |
| "step": 935000 |
| }, |
| { |
| "epoch": 8.631586717229034, |
| "grad_norm": 0.6534589529037476, |
| "learning_rate": 6.8421125474022195e-06, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 957945856, |
| "step": 935500 |
| }, |
| { |
| "epoch": 8.636200071968334, |
| "grad_norm": 0.33386147022247314, |
| "learning_rate": 6.8190457737057235e-06, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 958457856, |
| "step": 936000 |
| }, |
| { |
| "epoch": 8.640813426707634, |
| "grad_norm": 1.6744736433029175, |
| "learning_rate": 6.795979000009227e-06, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 958969856, |
| "step": 936500 |
| }, |
| { |
| "epoch": 8.645426781446933, |
| "grad_norm": 6.504983425140381, |
| "learning_rate": 6.7729122263127306e-06, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 959481856, |
| "step": 937000 |
| }, |
| { |
| "epoch": 8.650040136186233, |
| "grad_norm": 1.2921936511993408, |
| "learning_rate": 6.749845452616234e-06, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 959993856, |
| "step": 937500 |
| }, |
| { |
| "epoch": 8.654653490925531, |
| "grad_norm": 1.5937762260437012, |
| "learning_rate": 6.726778678919738e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 960505856, |
| "step": 938000 |
| }, |
| { |
| "epoch": 8.659266845664831, |
| "grad_norm": 0.9005319476127625, |
| "learning_rate": 6.703711905223241e-06, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 961017856, |
| "step": 938500 |
| }, |
| { |
| "epoch": 8.66388020040413, |
| "grad_norm": 1.019418716430664, |
| "learning_rate": 6.680645131526744e-06, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 961529856, |
| "step": 939000 |
| }, |
| { |
| "epoch": 8.66849355514343, |
| "grad_norm": 0.5105811953544617, |
| "learning_rate": 6.657578357830248e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 962041856, |
| "step": 939500 |
| }, |
| { |
| "epoch": 8.673106909882728, |
| "grad_norm": 0.6588147282600403, |
| "learning_rate": 6.634511584133751e-06, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 962553856, |
| "step": 940000 |
| }, |
| { |
| "epoch": 8.677720264622028, |
| "grad_norm": 0.5775207877159119, |
| "learning_rate": 6.611444810437253e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 963065856, |
| "step": 940500 |
| }, |
| { |
| "epoch": 8.682333619361327, |
| "grad_norm": 1.1807801723480225, |
| "learning_rate": 6.588378036740757e-06, |
| "loss": 0.015, |
| "num_input_tokens_seen": 963577856, |
| "step": 941000 |
| }, |
| { |
| "epoch": 8.686946974100627, |
| "grad_norm": 0.7394533157348633, |
| "learning_rate": 6.56531126304426e-06, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 964089856, |
| "step": 941500 |
| }, |
| { |
| "epoch": 8.691560328839925, |
| "grad_norm": 0.5393823385238647, |
| "learning_rate": 6.5422444893477635e-06, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 964601856, |
| "step": 942000 |
| }, |
| { |
| "epoch": 8.696173683579225, |
| "grad_norm": 1.1270785331726074, |
| "learning_rate": 6.5191777156512675e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 965113856, |
| "step": 942500 |
| }, |
| { |
| "epoch": 8.700787038318525, |
| "grad_norm": 1.156285047531128, |
| "learning_rate": 6.496110941954771e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 965625856, |
| "step": 943000 |
| }, |
| { |
| "epoch": 8.705400393057824, |
| "grad_norm": 0.3501507639884949, |
| "learning_rate": 6.4730441682582746e-06, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 966137856, |
| "step": 943500 |
| }, |
| { |
| "epoch": 8.710013747797124, |
| "grad_norm": 0.7830114960670471, |
| "learning_rate": 6.449977394561778e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 966649856, |
| "step": 944000 |
| }, |
| { |
| "epoch": 8.714627102536422, |
| "grad_norm": 0.9424002766609192, |
| "learning_rate": 6.426910620865282e-06, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 967161856, |
| "step": 944500 |
| }, |
| { |
| "epoch": 8.719240457275722, |
| "grad_norm": 1.7092015743255615, |
| "learning_rate": 6.403843847168785e-06, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 967673856, |
| "step": 945000 |
| }, |
| { |
| "epoch": 8.72385381201502, |
| "grad_norm": 0.3808750808238983, |
| "learning_rate": 6.380777073472288e-06, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 968185856, |
| "step": 945500 |
| }, |
| { |
| "epoch": 8.72846716675432, |
| "grad_norm": 0.8436591625213623, |
| "learning_rate": 6.357710299775792e-06, |
| "loss": 0.015, |
| "num_input_tokens_seen": 968697856, |
| "step": 946000 |
| }, |
| { |
| "epoch": 8.73308052149362, |
| "grad_norm": 0.48995792865753174, |
| "learning_rate": 6.334643526079295e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 969209856, |
| "step": 946500 |
| }, |
| { |
| "epoch": 8.73769387623292, |
| "grad_norm": 0.6074419021606445, |
| "learning_rate": 6.311576752382799e-06, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 969721856, |
| "step": 947000 |
| }, |
| { |
| "epoch": 8.742307230972218, |
| "grad_norm": 1.1008994579315186, |
| "learning_rate": 6.2885099786863e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 970233856, |
| "step": 947500 |
| }, |
| { |
| "epoch": 8.746920585711518, |
| "grad_norm": 0.4239863157272339, |
| "learning_rate": 6.265443204989804e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 970745856, |
| "step": 948000 |
| }, |
| { |
| "epoch": 8.751533940450816, |
| "grad_norm": 0.8348074555397034, |
| "learning_rate": 6.242376431293308e-06, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 971257856, |
| "step": 948500 |
| }, |
| { |
| "epoch": 8.756147295190116, |
| "grad_norm": 0.9429554343223572, |
| "learning_rate": 6.2193096575968115e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 971769856, |
| "step": 949000 |
| }, |
| { |
| "epoch": 8.760760649929416, |
| "grad_norm": 0.8379220366477966, |
| "learning_rate": 6.196242883900315e-06, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 972281856, |
| "step": 949500 |
| }, |
| { |
| "epoch": 8.765374004668715, |
| "grad_norm": 0.543300211429596, |
| "learning_rate": 6.1731761102038186e-06, |
| "loss": 0.0168, |
| "num_input_tokens_seen": 972793856, |
| "step": 950000 |
| }, |
| { |
| "epoch": 8.769987359408015, |
| "grad_norm": 1.0430985689163208, |
| "learning_rate": 6.150109336507322e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 973305856, |
| "step": 950500 |
| }, |
| { |
| "epoch": 8.774600714147313, |
| "grad_norm": 1.5497344732284546, |
| "learning_rate": 6.127042562810825e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 973817856, |
| "step": 951000 |
| }, |
| { |
| "epoch": 8.779214068886613, |
| "grad_norm": 0.5469529628753662, |
| "learning_rate": 6.103975789114329e-06, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 974329856, |
| "step": 951500 |
| }, |
| { |
| "epoch": 8.783827423625912, |
| "grad_norm": 1.1605631113052368, |
| "learning_rate": 6.080909015417832e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 974841856, |
| "step": 952000 |
| }, |
| { |
| "epoch": 8.788440778365212, |
| "grad_norm": 0.4232845604419708, |
| "learning_rate": 6.057842241721335e-06, |
| "loss": 0.015, |
| "num_input_tokens_seen": 975353856, |
| "step": 952500 |
| }, |
| { |
| "epoch": 8.79305413310451, |
| "grad_norm": 0.9222050905227661, |
| "learning_rate": 6.034775468024838e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 975865856, |
| "step": 953000 |
| }, |
| { |
| "epoch": 8.79766748784381, |
| "grad_norm": 0.6866771578788757, |
| "learning_rate": 6.011708694328342e-06, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 976377856, |
| "step": 953500 |
| }, |
| { |
| "epoch": 8.802280842583109, |
| "grad_norm": 0.7165865302085876, |
| "learning_rate": 5.988641920631845e-06, |
| "loss": 0.0153, |
| "num_input_tokens_seen": 976889856, |
| "step": 954000 |
| }, |
| { |
| "epoch": 8.806894197322409, |
| "grad_norm": 0.8396665453910828, |
| "learning_rate": 5.965575146935349e-06, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 977401856, |
| "step": 954500 |
| }, |
| { |
| "epoch": 8.811507552061709, |
| "grad_norm": 0.6975528597831726, |
| "learning_rate": 5.942508373238852e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 977913856, |
| "step": 955000 |
| }, |
| { |
| "epoch": 8.816120906801007, |
| "grad_norm": 0.8357110619544983, |
| "learning_rate": 5.9194415995423555e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 978425856, |
| "step": 955500 |
| }, |
| { |
| "epoch": 8.820734261540307, |
| "grad_norm": 0.9856480956077576, |
| "learning_rate": 5.896374825845859e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 978937856, |
| "step": 956000 |
| }, |
| { |
| "epoch": 8.825347616279606, |
| "grad_norm": 1.2731949090957642, |
| "learning_rate": 5.8733080521493626e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 979449856, |
| "step": 956500 |
| }, |
| { |
| "epoch": 8.829960971018906, |
| "grad_norm": 0.7930001020431519, |
| "learning_rate": 5.850241278452866e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 979961856, |
| "step": 957000 |
| }, |
| { |
| "epoch": 8.834574325758204, |
| "grad_norm": 0.7619320154190063, |
| "learning_rate": 5.827174504756369e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 980473856, |
| "step": 957500 |
| }, |
| { |
| "epoch": 8.839187680497504, |
| "grad_norm": 0.7133992314338684, |
| "learning_rate": 5.804107731059873e-06, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 980985856, |
| "step": 958000 |
| }, |
| { |
| "epoch": 8.843801035236803, |
| "grad_norm": 0.42310747504234314, |
| "learning_rate": 5.781040957363375e-06, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 981497856, |
| "step": 958500 |
| }, |
| { |
| "epoch": 8.848414389976103, |
| "grad_norm": 0.3348715305328369, |
| "learning_rate": 5.757974183666879e-06, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 982009856, |
| "step": 959000 |
| }, |
| { |
| "epoch": 8.853027744715401, |
| "grad_norm": 0.6126227974891663, |
| "learning_rate": 5.734907409970382e-06, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 982521856, |
| "step": 959500 |
| }, |
| { |
| "epoch": 8.857641099454701, |
| "grad_norm": 0.6455732583999634, |
| "learning_rate": 5.711840636273886e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 983033856, |
| "step": 960000 |
| }, |
| { |
| "epoch": 8.862254454194002, |
| "grad_norm": 1.075323224067688, |
| "learning_rate": 5.688773862577389e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 983545856, |
| "step": 960500 |
| }, |
| { |
| "epoch": 8.8668678089333, |
| "grad_norm": 0.8069124817848206, |
| "learning_rate": 5.665707088880893e-06, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 984057856, |
| "step": 961000 |
| }, |
| { |
| "epoch": 8.8714811636726, |
| "grad_norm": 0.9779102206230164, |
| "learning_rate": 5.642640315184396e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 984569856, |
| "step": 961500 |
| }, |
| { |
| "epoch": 8.876094518411898, |
| "grad_norm": 0.8441368937492371, |
| "learning_rate": 5.6195735414878994e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 985081856, |
| "step": 962000 |
| }, |
| { |
| "epoch": 8.880707873151199, |
| "grad_norm": 0.44055867195129395, |
| "learning_rate": 5.5965067677914026e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 985593856, |
| "step": 962500 |
| }, |
| { |
| "epoch": 8.885321227890497, |
| "grad_norm": 1.1985424757003784, |
| "learning_rate": 5.5734399940949065e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 986105856, |
| "step": 963000 |
| }, |
| { |
| "epoch": 8.889934582629797, |
| "grad_norm": 1.8032441139221191, |
| "learning_rate": 5.55037322039841e-06, |
| "loss": 0.017, |
| "num_input_tokens_seen": 986617856, |
| "step": 963500 |
| }, |
| { |
| "epoch": 8.894547937369095, |
| "grad_norm": 2.679948329925537, |
| "learning_rate": 5.527306446701913e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 987129856, |
| "step": 964000 |
| }, |
| { |
| "epoch": 8.899161292108396, |
| "grad_norm": 1.422170639038086, |
| "learning_rate": 5.504239673005417e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 987641856, |
| "step": 964500 |
| }, |
| { |
| "epoch": 8.903774646847694, |
| "grad_norm": 0.785531759262085, |
| "learning_rate": 5.481172899308919e-06, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 988153856, |
| "step": 965000 |
| }, |
| { |
| "epoch": 8.908388001586994, |
| "grad_norm": 0.813910961151123, |
| "learning_rate": 5.458106125612423e-06, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 988665856, |
| "step": 965500 |
| }, |
| { |
| "epoch": 8.913001356326294, |
| "grad_norm": 0.6769202351570129, |
| "learning_rate": 5.435039351915926e-06, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 989177856, |
| "step": 966000 |
| }, |
| { |
| "epoch": 8.917614711065593, |
| "grad_norm": 2.5310189723968506, |
| "learning_rate": 5.41197257821943e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 989689856, |
| "step": 966500 |
| }, |
| { |
| "epoch": 8.922228065804893, |
| "grad_norm": 0.5400819182395935, |
| "learning_rate": 5.388905804522933e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 990201856, |
| "step": 967000 |
| }, |
| { |
| "epoch": 8.926841420544191, |
| "grad_norm": 0.33608752489089966, |
| "learning_rate": 5.365839030826437e-06, |
| "loss": 0.0153, |
| "num_input_tokens_seen": 990713856, |
| "step": 967500 |
| }, |
| { |
| "epoch": 8.931454775283491, |
| "grad_norm": 0.6144788265228271, |
| "learning_rate": 5.34277225712994e-06, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 991225856, |
| "step": 968000 |
| }, |
| { |
| "epoch": 8.93606813002279, |
| "grad_norm": 0.8687652349472046, |
| "learning_rate": 5.3197054834334434e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 991737856, |
| "step": 968500 |
| }, |
| { |
| "epoch": 8.94068148476209, |
| "grad_norm": 0.9648618698120117, |
| "learning_rate": 5.2966387097369466e-06, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 992249856, |
| "step": 969000 |
| }, |
| { |
| "epoch": 8.945294839501388, |
| "grad_norm": 0.6023857593536377, |
| "learning_rate": 5.27357193604045e-06, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 992761856, |
| "step": 969500 |
| }, |
| { |
| "epoch": 8.949908194240688, |
| "grad_norm": 1.8448054790496826, |
| "learning_rate": 5.250505162343954e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 993273856, |
| "step": 970000 |
| }, |
| { |
| "epoch": 8.954521548979987, |
| "grad_norm": 0.6951389312744141, |
| "learning_rate": 5.227438388647457e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 993785856, |
| "step": 970500 |
| }, |
| { |
| "epoch": 8.959134903719287, |
| "grad_norm": 0.5784729719161987, |
| "learning_rate": 5.204371614950961e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 994297856, |
| "step": 971000 |
| }, |
| { |
| "epoch": 8.963748258458587, |
| "grad_norm": 1.4732640981674194, |
| "learning_rate": 5.181304841254463e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 994809856, |
| "step": 971500 |
| }, |
| { |
| "epoch": 8.968361613197885, |
| "grad_norm": 0.9267556667327881, |
| "learning_rate": 5.158238067557967e-06, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 995321856, |
| "step": 972000 |
| }, |
| { |
| "epoch": 8.972974967937185, |
| "grad_norm": 0.3285810053348541, |
| "learning_rate": 5.13517129386147e-06, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 995833856, |
| "step": 972500 |
| }, |
| { |
| "epoch": 8.977588322676484, |
| "grad_norm": 1.0577844381332397, |
| "learning_rate": 5.112104520164974e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 996345856, |
| "step": 973000 |
| }, |
| { |
| "epoch": 8.982201677415784, |
| "grad_norm": 0.40497535467147827, |
| "learning_rate": 5.089037746468477e-06, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 996857856, |
| "step": 973500 |
| }, |
| { |
| "epoch": 8.986815032155082, |
| "grad_norm": 0.6067364811897278, |
| "learning_rate": 5.065970972771981e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 997369856, |
| "step": 974000 |
| }, |
| { |
| "epoch": 8.991428386894382, |
| "grad_norm": 0.5121076703071594, |
| "learning_rate": 5.042904199075484e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 997881856, |
| "step": 974500 |
| }, |
| { |
| "epoch": 8.99604174163368, |
| "grad_norm": 1.0173983573913574, |
| "learning_rate": 5.0198374253789874e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 998393856, |
| "step": 975000 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_combined_score": 0.06468997752487994, |
| "eval_loss": 0.06468997895717621, |
| "eval_mse": 0.06468997609258367, |
| "eval_runtime": 45.8521, |
| "eval_samples_per_second": 2101.059, |
| "eval_steps_per_second": 262.649, |
| "num_input_tokens_seen": 998832384, |
| "step": 975429 |
| }, |
| { |
| "epoch": 9.00065509637298, |
| "grad_norm": 0.4236084818840027, |
| "learning_rate": 4.9967706516824906e-06, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 998905088, |
| "step": 975500 |
| }, |
| { |
| "epoch": 9.00526845111228, |
| "grad_norm": 0.6183050870895386, |
| "learning_rate": 4.973703877985994e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 999417088, |
| "step": 976000 |
| }, |
| { |
| "epoch": 9.00988180585158, |
| "grad_norm": 3.3244409561157227, |
| "learning_rate": 4.950637104289498e-06, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 999929088, |
| "step": 976500 |
| }, |
| { |
| "epoch": 9.014495160590878, |
| "grad_norm": 0.5056183934211731, |
| "learning_rate": 4.927570330593001e-06, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1000441088, |
| "step": 977000 |
| }, |
| { |
| "epoch": 9.019108515330178, |
| "grad_norm": 0.6775535941123962, |
| "learning_rate": 4.904503556896505e-06, |
| "loss": 0.0135, |
| "num_input_tokens_seen": 1000953088, |
| "step": 977500 |
| }, |
| { |
| "epoch": 9.023721870069478, |
| "grad_norm": 0.4014028012752533, |
| "learning_rate": 4.881436783200007e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1001465088, |
| "step": 978000 |
| }, |
| { |
| "epoch": 9.028335224808776, |
| "grad_norm": 0.6904358863830566, |
| "learning_rate": 4.858370009503511e-06, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1001977088, |
| "step": 978500 |
| }, |
| { |
| "epoch": 9.032948579548076, |
| "grad_norm": 1.717046856880188, |
| "learning_rate": 4.835303235807014e-06, |
| "loss": 0.0135, |
| "num_input_tokens_seen": 1002489088, |
| "step": 979000 |
| }, |
| { |
| "epoch": 9.037561934287375, |
| "grad_norm": 1.1280878782272339, |
| "learning_rate": 4.812236462110518e-06, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 1003001088, |
| "step": 979500 |
| }, |
| { |
| "epoch": 9.042175289026675, |
| "grad_norm": 0.9828783869743347, |
| "learning_rate": 4.789169688414021e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1003513088, |
| "step": 980000 |
| }, |
| { |
| "epoch": 9.046788643765973, |
| "grad_norm": 0.9039996266365051, |
| "learning_rate": 4.766102914717524e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1004025088, |
| "step": 980500 |
| }, |
| { |
| "epoch": 9.051401998505273, |
| "grad_norm": 0.760273277759552, |
| "learning_rate": 4.743036141021028e-06, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 1004537088, |
| "step": 981000 |
| }, |
| { |
| "epoch": 9.056015353244572, |
| "grad_norm": 0.6820119619369507, |
| "learning_rate": 4.719969367324531e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 1005049088, |
| "step": 981500 |
| }, |
| { |
| "epoch": 9.060628707983872, |
| "grad_norm": 0.8274890780448914, |
| "learning_rate": 4.6969025936280346e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1005561088, |
| "step": 982000 |
| }, |
| { |
| "epoch": 9.06524206272317, |
| "grad_norm": 0.43844661116600037, |
| "learning_rate": 4.673835819931538e-06, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1006073088, |
| "step": 982500 |
| }, |
| { |
| "epoch": 9.06985541746247, |
| "grad_norm": 1.0397804975509644, |
| "learning_rate": 4.650769046235042e-06, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 1006585088, |
| "step": 983000 |
| }, |
| { |
| "epoch": 9.07446877220177, |
| "grad_norm": 1.1185849905014038, |
| "learning_rate": 4.627702272538545e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1007097088, |
| "step": 983500 |
| }, |
| { |
| "epoch": 9.079082126941069, |
| "grad_norm": 0.4616248607635498, |
| "learning_rate": 4.604635498842049e-06, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 1007609088, |
| "step": 984000 |
| }, |
| { |
| "epoch": 9.083695481680369, |
| "grad_norm": 0.4887053966522217, |
| "learning_rate": 4.581568725145551e-06, |
| "loss": 0.0119, |
| "num_input_tokens_seen": 1008121088, |
| "step": 984500 |
| }, |
| { |
| "epoch": 9.088308836419667, |
| "grad_norm": 0.9657731056213379, |
| "learning_rate": 4.558501951449055e-06, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 1008633088, |
| "step": 985000 |
| }, |
| { |
| "epoch": 9.092922191158967, |
| "grad_norm": 0.6589749455451965, |
| "learning_rate": 4.535435177752558e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1009145088, |
| "step": 985500 |
| }, |
| { |
| "epoch": 9.097535545898266, |
| "grad_norm": 1.095737338066101, |
| "learning_rate": 4.512368404056062e-06, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 1009657088, |
| "step": 986000 |
| }, |
| { |
| "epoch": 9.102148900637566, |
| "grad_norm": 0.9578360915184021, |
| "learning_rate": 4.489301630359565e-06, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 1010169088, |
| "step": 986500 |
| }, |
| { |
| "epoch": 9.106762255376864, |
| "grad_norm": 1.0494704246520996, |
| "learning_rate": 4.466234856663068e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 1010681088, |
| "step": 987000 |
| }, |
| { |
| "epoch": 9.111375610116164, |
| "grad_norm": 0.3351483643054962, |
| "learning_rate": 4.443168082966572e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 1011193088, |
| "step": 987500 |
| }, |
| { |
| "epoch": 9.115988964855463, |
| "grad_norm": 1.107553482055664, |
| "learning_rate": 4.4201013092700746e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 1011705088, |
| "step": 988000 |
| }, |
| { |
| "epoch": 9.120602319594763, |
| "grad_norm": 0.8427937626838684, |
| "learning_rate": 4.3970345355735785e-06, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 1012217088, |
| "step": 988500 |
| }, |
| { |
| "epoch": 9.125215674334063, |
| "grad_norm": 0.5374360084533691, |
| "learning_rate": 4.373967761877082e-06, |
| "loss": 0.0121, |
| "num_input_tokens_seen": 1012729088, |
| "step": 989000 |
| }, |
| { |
| "epoch": 9.129829029073361, |
| "grad_norm": 1.2801436185836792, |
| "learning_rate": 4.350900988180586e-06, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 1013241088, |
| "step": 989500 |
| }, |
| { |
| "epoch": 9.134442383812662, |
| "grad_norm": 2.0048415660858154, |
| "learning_rate": 4.327834214484089e-06, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1013753088, |
| "step": 990000 |
| }, |
| { |
| "epoch": 9.13905573855196, |
| "grad_norm": 1.3461086750030518, |
| "learning_rate": 4.304767440787593e-06, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 1014265088, |
| "step": 990500 |
| }, |
| { |
| "epoch": 9.14366909329126, |
| "grad_norm": 0.5770676732063293, |
| "learning_rate": 4.281700667091096e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 1014777088, |
| "step": 991000 |
| }, |
| { |
| "epoch": 9.148282448030558, |
| "grad_norm": 0.7648055553436279, |
| "learning_rate": 4.258633893394599e-06, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 1015289088, |
| "step": 991500 |
| }, |
| { |
| "epoch": 9.152895802769859, |
| "grad_norm": 0.8219977021217346, |
| "learning_rate": 4.235567119698102e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1015801088, |
| "step": 992000 |
| }, |
| { |
| "epoch": 9.157509157509157, |
| "grad_norm": 0.2618965804576874, |
| "learning_rate": 4.212500346001605e-06, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 1016313088, |
| "step": 992500 |
| }, |
| { |
| "epoch": 9.162122512248457, |
| "grad_norm": 0.580898642539978, |
| "learning_rate": 4.189433572305109e-06, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 1016825088, |
| "step": 993000 |
| }, |
| { |
| "epoch": 9.166735866987755, |
| "grad_norm": 1.426604151725769, |
| "learning_rate": 4.166366798608612e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1017337088, |
| "step": 993500 |
| }, |
| { |
| "epoch": 9.171349221727056, |
| "grad_norm": 0.4607691764831543, |
| "learning_rate": 4.143300024912116e-06, |
| "loss": 0.0126, |
| "num_input_tokens_seen": 1017849088, |
| "step": 994000 |
| }, |
| { |
| "epoch": 9.175962576466354, |
| "grad_norm": 0.5528801083564758, |
| "learning_rate": 4.1202332512156186e-06, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1018361088, |
| "step": 994500 |
| }, |
| { |
| "epoch": 9.180575931205654, |
| "grad_norm": 0.24360989034175873, |
| "learning_rate": 4.0971664775191225e-06, |
| "loss": 0.0117, |
| "num_input_tokens_seen": 1018873088, |
| "step": 995000 |
| }, |
| { |
| "epoch": 9.185189285944954, |
| "grad_norm": 0.5846107602119446, |
| "learning_rate": 4.074099703822626e-06, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 1019385088, |
| "step": 995500 |
| }, |
| { |
| "epoch": 9.189802640684253, |
| "grad_norm": 0.8627530932426453, |
| "learning_rate": 4.05103293012613e-06, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 1019897088, |
| "step": 996000 |
| }, |
| { |
| "epoch": 9.194415995423553, |
| "grad_norm": 0.7435634732246399, |
| "learning_rate": 4.027966156429633e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1020409088, |
| "step": 996500 |
| }, |
| { |
| "epoch": 9.199029350162851, |
| "grad_norm": 0.6394104957580566, |
| "learning_rate": 4.004899382733137e-06, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 1020921088, |
| "step": 997000 |
| }, |
| { |
| "epoch": 9.203642704902151, |
| "grad_norm": 0.4735194444656372, |
| "learning_rate": 3.98183260903664e-06, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1021433088, |
| "step": 997500 |
| }, |
| { |
| "epoch": 9.20825605964145, |
| "grad_norm": 0.9603920578956604, |
| "learning_rate": 3.958765835340143e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1021945088, |
| "step": 998000 |
| }, |
| { |
| "epoch": 9.21286941438075, |
| "grad_norm": 1.0817182064056396, |
| "learning_rate": 3.935699061643646e-06, |
| "loss": 0.0117, |
| "num_input_tokens_seen": 1022457088, |
| "step": 998500 |
| }, |
| { |
| "epoch": 9.217482769120048, |
| "grad_norm": 0.5785081386566162, |
| "learning_rate": 3.912632287947149e-06, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 1022969088, |
| "step": 999000 |
| }, |
| { |
| "epoch": 9.222096123859348, |
| "grad_norm": 0.34806227684020996, |
| "learning_rate": 3.889565514250653e-06, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 1023481088, |
| "step": 999500 |
| }, |
| { |
| "epoch": 9.226709478598647, |
| "grad_norm": 0.8392277359962463, |
| "learning_rate": 3.866498740554156e-06, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1023993088, |
| "step": 1000000 |
| }, |
| { |
| "epoch": 9.231322833337947, |
| "grad_norm": 0.34862348437309265, |
| "learning_rate": 3.84343196685766e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 1024505088, |
| "step": 1000500 |
| }, |
| { |
| "epoch": 9.235936188077247, |
| "grad_norm": 0.8864858150482178, |
| "learning_rate": 3.8203651931611626e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1025017088, |
| "step": 1001000 |
| }, |
| { |
| "epoch": 9.240549542816545, |
| "grad_norm": 0.7740064263343811, |
| "learning_rate": 3.797298419464666e-06, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1025529088, |
| "step": 1001500 |
| }, |
| { |
| "epoch": 9.245162897555845, |
| "grad_norm": 0.21236860752105713, |
| "learning_rate": 3.7742316457681697e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1026041088, |
| "step": 1002000 |
| }, |
| { |
| "epoch": 9.249776252295144, |
| "grad_norm": 0.5248683094978333, |
| "learning_rate": 3.751164872071673e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1026553088, |
| "step": 1002500 |
| }, |
| { |
| "epoch": 9.254389607034444, |
| "grad_norm": 0.49671700596809387, |
| "learning_rate": 3.7280980983751767e-06, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 1027065088, |
| "step": 1003000 |
| }, |
| { |
| "epoch": 9.259002961773742, |
| "grad_norm": 0.7748130559921265, |
| "learning_rate": 3.7050313246786803e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1027577088, |
| "step": 1003500 |
| }, |
| { |
| "epoch": 9.263616316513042, |
| "grad_norm": 0.5696319341659546, |
| "learning_rate": 3.681964550982184e-06, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1028089088, |
| "step": 1004000 |
| }, |
| { |
| "epoch": 9.26822967125234, |
| "grad_norm": 1.47969651222229, |
| "learning_rate": 3.6588977772856865e-06, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 1028601088, |
| "step": 1004500 |
| }, |
| { |
| "epoch": 9.27284302599164, |
| "grad_norm": 0.6833159923553467, |
| "learning_rate": 3.63583100358919e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 1029113088, |
| "step": 1005000 |
| }, |
| { |
| "epoch": 9.27745638073094, |
| "grad_norm": 0.9838703870773315, |
| "learning_rate": 3.6127642298926936e-06, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 1029625088, |
| "step": 1005500 |
| }, |
| { |
| "epoch": 9.28206973547024, |
| "grad_norm": 0.5185501575469971, |
| "learning_rate": 3.589697456196197e-06, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 1030137088, |
| "step": 1006000 |
| }, |
| { |
| "epoch": 9.28668309020954, |
| "grad_norm": 0.6044150590896606, |
| "learning_rate": 3.5666306824997003e-06, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1030649088, |
| "step": 1006500 |
| }, |
| { |
| "epoch": 9.291296444948838, |
| "grad_norm": 0.5589469075202942, |
| "learning_rate": 3.543563908803204e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1031161088, |
| "step": 1007000 |
| }, |
| { |
| "epoch": 9.295909799688138, |
| "grad_norm": 0.8428828120231628, |
| "learning_rate": 3.5204971351067066e-06, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1031673088, |
| "step": 1007500 |
| }, |
| { |
| "epoch": 9.300523154427436, |
| "grad_norm": 1.0949701070785522, |
| "learning_rate": 3.49743036141021e-06, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1032185088, |
| "step": 1008000 |
| }, |
| { |
| "epoch": 9.305136509166736, |
| "grad_norm": 0.48161888122558594, |
| "learning_rate": 3.4743635877137136e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 1032697088, |
| "step": 1008500 |
| }, |
| { |
| "epoch": 9.309749863906035, |
| "grad_norm": 1.4229580163955688, |
| "learning_rate": 3.451296814017217e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1033209088, |
| "step": 1009000 |
| }, |
| { |
| "epoch": 9.314363218645335, |
| "grad_norm": 1.3797547817230225, |
| "learning_rate": 3.4282300403207207e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 1033721088, |
| "step": 1009500 |
| }, |
| { |
| "epoch": 9.318976573384633, |
| "grad_norm": 0.764750599861145, |
| "learning_rate": 3.4051632666242243e-06, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 1034233088, |
| "step": 1010000 |
| }, |
| { |
| "epoch": 9.323589928123933, |
| "grad_norm": 1.4155054092407227, |
| "learning_rate": 3.382096492927728e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 1034745088, |
| "step": 1010500 |
| }, |
| { |
| "epoch": 9.328203282863232, |
| "grad_norm": 0.5639691352844238, |
| "learning_rate": 3.3590297192312305e-06, |
| "loss": 0.0123, |
| "num_input_tokens_seen": 1035257088, |
| "step": 1011000 |
| }, |
| { |
| "epoch": 9.332816637602532, |
| "grad_norm": 1.6954376697540283, |
| "learning_rate": 3.335962945534734e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 1035769088, |
| "step": 1011500 |
| }, |
| { |
| "epoch": 9.337429992341832, |
| "grad_norm": 1.096420168876648, |
| "learning_rate": 3.3128961718382376e-06, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 1036281088, |
| "step": 1012000 |
| }, |
| { |
| "epoch": 9.34204334708113, |
| "grad_norm": 0.7063207626342773, |
| "learning_rate": 3.2898293981417408e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 1036793088, |
| "step": 1012500 |
| }, |
| { |
| "epoch": 9.34665670182043, |
| "grad_norm": 1.40740966796875, |
| "learning_rate": 3.2667626244452443e-06, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1037305088, |
| "step": 1013000 |
| }, |
| { |
| "epoch": 9.351270056559729, |
| "grad_norm": 1.0713701248168945, |
| "learning_rate": 3.243695850748748e-06, |
| "loss": 0.0122, |
| "num_input_tokens_seen": 1037817088, |
| "step": 1013500 |
| }, |
| { |
| "epoch": 9.355883411299029, |
| "grad_norm": 0.41992899775505066, |
| "learning_rate": 3.2206290770522505e-06, |
| "loss": 0.0114, |
| "num_input_tokens_seen": 1038329088, |
| "step": 1014000 |
| }, |
| { |
| "epoch": 9.360496766038327, |
| "grad_norm": 0.42630577087402344, |
| "learning_rate": 3.197562303355754e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 1038841088, |
| "step": 1014500 |
| }, |
| { |
| "epoch": 9.365110120777628, |
| "grad_norm": 1.1027462482452393, |
| "learning_rate": 3.1744955296592576e-06, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 1039353088, |
| "step": 1015000 |
| }, |
| { |
| "epoch": 9.369723475516926, |
| "grad_norm": 0.5520905256271362, |
| "learning_rate": 3.151428755962761e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1039865088, |
| "step": 1015500 |
| }, |
| { |
| "epoch": 9.374336830256226, |
| "grad_norm": 0.46760430932044983, |
| "learning_rate": 3.1283619822662647e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 1040377088, |
| "step": 1016000 |
| }, |
| { |
| "epoch": 9.378950184995524, |
| "grad_norm": 0.5815434455871582, |
| "learning_rate": 3.105295208569768e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 1040889088, |
| "step": 1016500 |
| }, |
| { |
| "epoch": 9.383563539734824, |
| "grad_norm": 1.3620293140411377, |
| "learning_rate": 3.0822284348732714e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1041401088, |
| "step": 1017000 |
| }, |
| { |
| "epoch": 9.388176894474123, |
| "grad_norm": 0.8543253540992737, |
| "learning_rate": 3.059161661176775e-06, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1041913088, |
| "step": 1017500 |
| }, |
| { |
| "epoch": 9.392790249213423, |
| "grad_norm": 1.2159240245819092, |
| "learning_rate": 3.036094887480278e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 1042425088, |
| "step": 1018000 |
| }, |
| { |
| "epoch": 9.397403603952723, |
| "grad_norm": 0.7059375643730164, |
| "learning_rate": 3.013028113783781e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1042937088, |
| "step": 1018500 |
| }, |
| { |
| "epoch": 9.402016958692021, |
| "grad_norm": 0.45824775099754333, |
| "learning_rate": 2.9899613400872847e-06, |
| "loss": 0.0135, |
| "num_input_tokens_seen": 1043449088, |
| "step": 1019000 |
| }, |
| { |
| "epoch": 9.406630313431322, |
| "grad_norm": 0.6606787443161011, |
| "learning_rate": 2.9668945663907883e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 1043961088, |
| "step": 1019500 |
| }, |
| { |
| "epoch": 9.41124366817062, |
| "grad_norm": 0.8153837323188782, |
| "learning_rate": 2.9438277926942914e-06, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1044473088, |
| "step": 1020000 |
| }, |
| { |
| "epoch": 9.41585702290992, |
| "grad_norm": 0.4770793318748474, |
| "learning_rate": 2.920761018997795e-06, |
| "loss": 0.0135, |
| "num_input_tokens_seen": 1044985088, |
| "step": 1020500 |
| }, |
| { |
| "epoch": 9.420470377649218, |
| "grad_norm": 1.226976990699768, |
| "learning_rate": 2.8976942453012985e-06, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1045497088, |
| "step": 1021000 |
| }, |
| { |
| "epoch": 9.425083732388519, |
| "grad_norm": 0.3825905919075012, |
| "learning_rate": 2.8746274716048016e-06, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1046009088, |
| "step": 1021500 |
| }, |
| { |
| "epoch": 9.429697087127817, |
| "grad_norm": 0.6580853462219238, |
| "learning_rate": 2.851560697908305e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 1046521088, |
| "step": 1022000 |
| }, |
| { |
| "epoch": 9.434310441867117, |
| "grad_norm": 1.0704902410507202, |
| "learning_rate": 2.8284939242118087e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1047033088, |
| "step": 1022500 |
| }, |
| { |
| "epoch": 9.438923796606415, |
| "grad_norm": 1.5487003326416016, |
| "learning_rate": 2.805427150515312e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 1047545088, |
| "step": 1023000 |
| }, |
| { |
| "epoch": 9.443537151345716, |
| "grad_norm": 0.4171670079231262, |
| "learning_rate": 2.7823603768188154e-06, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 1048057088, |
| "step": 1023500 |
| }, |
| { |
| "epoch": 9.448150506085016, |
| "grad_norm": 1.1898133754730225, |
| "learning_rate": 2.7592936031223185e-06, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 1048569088, |
| "step": 1024000 |
| }, |
| { |
| "epoch": 9.452763860824314, |
| "grad_norm": 0.4748603105545044, |
| "learning_rate": 2.7362268294258216e-06, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1049081088, |
| "step": 1024500 |
| }, |
| { |
| "epoch": 9.457377215563614, |
| "grad_norm": 1.6988264322280884, |
| "learning_rate": 2.713160055729325e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1049593088, |
| "step": 1025000 |
| }, |
| { |
| "epoch": 9.461990570302913, |
| "grad_norm": 1.1586196422576904, |
| "learning_rate": 2.6900932820328287e-06, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 1050105088, |
| "step": 1025500 |
| }, |
| { |
| "epoch": 9.466603925042213, |
| "grad_norm": 1.3323612213134766, |
| "learning_rate": 2.6670265083363323e-06, |
| "loss": 0.0117, |
| "num_input_tokens_seen": 1050617088, |
| "step": 1026000 |
| }, |
| { |
| "epoch": 9.471217279781511, |
| "grad_norm": 0.6006079316139221, |
| "learning_rate": 2.6439597346398354e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 1051129088, |
| "step": 1026500 |
| }, |
| { |
| "epoch": 9.475830634520811, |
| "grad_norm": 0.9578723907470703, |
| "learning_rate": 2.620892960943339e-06, |
| "loss": 0.0122, |
| "num_input_tokens_seen": 1051641088, |
| "step": 1027000 |
| }, |
| { |
| "epoch": 9.48044398926011, |
| "grad_norm": 0.9589295983314514, |
| "learning_rate": 2.5978261872468425e-06, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 1052153088, |
| "step": 1027500 |
| }, |
| { |
| "epoch": 9.48505734399941, |
| "grad_norm": 1.320854663848877, |
| "learning_rate": 2.5747594135503456e-06, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 1052665088, |
| "step": 1028000 |
| }, |
| { |
| "epoch": 9.489670698738708, |
| "grad_norm": 0.5850228071212769, |
| "learning_rate": 2.551692639853849e-06, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 1053177088, |
| "step": 1028500 |
| }, |
| { |
| "epoch": 9.494284053478008, |
| "grad_norm": 0.4947618544101715, |
| "learning_rate": 2.5286258661573527e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1053689088, |
| "step": 1029000 |
| }, |
| { |
| "epoch": 9.498897408217307, |
| "grad_norm": 1.5554652214050293, |
| "learning_rate": 2.505559092460856e-06, |
| "loss": 0.0114, |
| "num_input_tokens_seen": 1054201088, |
| "step": 1029500 |
| }, |
| { |
| "epoch": 9.503510762956607, |
| "grad_norm": 0.7134987711906433, |
| "learning_rate": 2.482492318764359e-06, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 1054713088, |
| "step": 1030000 |
| }, |
| { |
| "epoch": 9.508124117695907, |
| "grad_norm": 0.6300977468490601, |
| "learning_rate": 2.4594255450678625e-06, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1055225088, |
| "step": 1030500 |
| }, |
| { |
| "epoch": 9.512737472435205, |
| "grad_norm": 0.30723100900650024, |
| "learning_rate": 2.4363587713713656e-06, |
| "loss": 0.012, |
| "num_input_tokens_seen": 1055737088, |
| "step": 1031000 |
| }, |
| { |
| "epoch": 9.517350827174505, |
| "grad_norm": 0.5518991947174072, |
| "learning_rate": 2.413291997674869e-06, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1056249088, |
| "step": 1031500 |
| }, |
| { |
| "epoch": 9.521964181913804, |
| "grad_norm": 0.48715853691101074, |
| "learning_rate": 2.3902252239783727e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 1056761088, |
| "step": 1032000 |
| }, |
| { |
| "epoch": 9.526577536653104, |
| "grad_norm": 0.9060729742050171, |
| "learning_rate": 2.3671584502818763e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 1057273088, |
| "step": 1032500 |
| }, |
| { |
| "epoch": 9.531190891392402, |
| "grad_norm": 0.6399810910224915, |
| "learning_rate": 2.3440916765853794e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 1057785088, |
| "step": 1033000 |
| }, |
| { |
| "epoch": 9.535804246131702, |
| "grad_norm": 0.8663894534111023, |
| "learning_rate": 2.321024902888883e-06, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1058297088, |
| "step": 1033500 |
| }, |
| { |
| "epoch": 9.540417600871, |
| "grad_norm": 1.554218053817749, |
| "learning_rate": 2.2979581291923865e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1058809088, |
| "step": 1034000 |
| }, |
| { |
| "epoch": 9.5450309556103, |
| "grad_norm": 0.5967795848846436, |
| "learning_rate": 2.2748913554958896e-06, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 1059321088, |
| "step": 1034500 |
| }, |
| { |
| "epoch": 9.5496443103496, |
| "grad_norm": 0.7761898040771484, |
| "learning_rate": 2.251824581799393e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1059833088, |
| "step": 1035000 |
| }, |
| { |
| "epoch": 9.5542576650889, |
| "grad_norm": 0.4565838873386383, |
| "learning_rate": 2.2287578081028963e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1060345088, |
| "step": 1035500 |
| }, |
| { |
| "epoch": 9.5588710198282, |
| "grad_norm": 1.4918292760849, |
| "learning_rate": 2.2056910344063994e-06, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 1060857088, |
| "step": 1036000 |
| }, |
| { |
| "epoch": 9.563484374567498, |
| "grad_norm": 1.143227458000183, |
| "learning_rate": 2.182624260709903e-06, |
| "loss": 0.0126, |
| "num_input_tokens_seen": 1061369088, |
| "step": 1036500 |
| }, |
| { |
| "epoch": 9.568097729306798, |
| "grad_norm": 0.4711507558822632, |
| "learning_rate": 2.1595574870134065e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1061881088, |
| "step": 1037000 |
| }, |
| { |
| "epoch": 9.572711084046096, |
| "grad_norm": 1.8225018978118896, |
| "learning_rate": 2.1364907133169096e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 1062393088, |
| "step": 1037500 |
| }, |
| { |
| "epoch": 9.577324438785396, |
| "grad_norm": 1.6516982316970825, |
| "learning_rate": 2.113423939620413e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1062905088, |
| "step": 1038000 |
| }, |
| { |
| "epoch": 9.581937793524695, |
| "grad_norm": 0.6592885255813599, |
| "learning_rate": 2.0903571659239167e-06, |
| "loss": 0.0135, |
| "num_input_tokens_seen": 1063417088, |
| "step": 1038500 |
| }, |
| { |
| "epoch": 9.586551148263995, |
| "grad_norm": 0.9162536263465881, |
| "learning_rate": 2.0672903922274203e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 1063929088, |
| "step": 1039000 |
| }, |
| { |
| "epoch": 9.591164503003293, |
| "grad_norm": 1.3136478662490845, |
| "learning_rate": 2.0442236185309234e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 1064441088, |
| "step": 1039500 |
| }, |
| { |
| "epoch": 9.595777857742593, |
| "grad_norm": 0.8929975032806396, |
| "learning_rate": 2.021156844834427e-06, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 1064953088, |
| "step": 1040000 |
| }, |
| { |
| "epoch": 9.600391212481892, |
| "grad_norm": 0.6862032413482666, |
| "learning_rate": 1.9980900711379305e-06, |
| "loss": 0.0116, |
| "num_input_tokens_seen": 1065465088, |
| "step": 1040500 |
| }, |
| { |
| "epoch": 9.605004567221192, |
| "grad_norm": 1.4420340061187744, |
| "learning_rate": 1.9750232974414336e-06, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 1065977088, |
| "step": 1041000 |
| }, |
| { |
| "epoch": 9.609617921960492, |
| "grad_norm": 0.6107918620109558, |
| "learning_rate": 1.9519565237449367e-06, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1066489088, |
| "step": 1041500 |
| }, |
| { |
| "epoch": 9.61423127669979, |
| "grad_norm": 0.8065725564956665, |
| "learning_rate": 1.9288897500484403e-06, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 1067001088, |
| "step": 1042000 |
| }, |
| { |
| "epoch": 9.61884463143909, |
| "grad_norm": 1.1736738681793213, |
| "learning_rate": 1.9058229763519436e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 1067513088, |
| "step": 1042500 |
| }, |
| { |
| "epoch": 9.623457986178389, |
| "grad_norm": 3.729763984680176, |
| "learning_rate": 1.882756202655447e-06, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 1068025088, |
| "step": 1043000 |
| }, |
| { |
| "epoch": 9.628071340917689, |
| "grad_norm": 0.39236801862716675, |
| "learning_rate": 1.8596894289589505e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 1068537088, |
| "step": 1043500 |
| }, |
| { |
| "epoch": 9.632684695656987, |
| "grad_norm": 1.0780402421951294, |
| "learning_rate": 1.8366226552624536e-06, |
| "loss": 0.0112, |
| "num_input_tokens_seen": 1069049088, |
| "step": 1044000 |
| }, |
| { |
| "epoch": 9.637298050396288, |
| "grad_norm": 0.5110656023025513, |
| "learning_rate": 1.8135558815659572e-06, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 1069561088, |
| "step": 1044500 |
| }, |
| { |
| "epoch": 9.641911405135586, |
| "grad_norm": 0.23593804240226746, |
| "learning_rate": 1.7904891078694607e-06, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 1070073088, |
| "step": 1045000 |
| }, |
| { |
| "epoch": 9.646524759874886, |
| "grad_norm": 0.9505711793899536, |
| "learning_rate": 1.767422334172964e-06, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1070585088, |
| "step": 1045500 |
| }, |
| { |
| "epoch": 9.651138114614184, |
| "grad_norm": 0.9649909138679504, |
| "learning_rate": 1.7443555604764672e-06, |
| "loss": 0.0153, |
| "num_input_tokens_seen": 1071097088, |
| "step": 1046000 |
| }, |
| { |
| "epoch": 9.655751469353484, |
| "grad_norm": 0.29947414994239807, |
| "learning_rate": 1.7212887867799707e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 1071609088, |
| "step": 1046500 |
| }, |
| { |
| "epoch": 9.660364824092785, |
| "grad_norm": 0.9218162298202515, |
| "learning_rate": 1.6982220130834743e-06, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 1072121088, |
| "step": 1047000 |
| }, |
| { |
| "epoch": 9.664978178832083, |
| "grad_norm": 1.3005330562591553, |
| "learning_rate": 1.6751552393869774e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 1072633088, |
| "step": 1047500 |
| }, |
| { |
| "epoch": 9.669591533571383, |
| "grad_norm": 1.300002932548523, |
| "learning_rate": 1.652088465690481e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1073145088, |
| "step": 1048000 |
| }, |
| { |
| "epoch": 9.674204888310681, |
| "grad_norm": 0.6326736211776733, |
| "learning_rate": 1.6290216919939843e-06, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 1073657088, |
| "step": 1048500 |
| }, |
| { |
| "epoch": 9.678818243049982, |
| "grad_norm": 0.865162193775177, |
| "learning_rate": 1.6059549182974874e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1074169088, |
| "step": 1049000 |
| }, |
| { |
| "epoch": 9.68343159778928, |
| "grad_norm": 0.6226495504379272, |
| "learning_rate": 1.582888144600991e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1074681088, |
| "step": 1049500 |
| }, |
| { |
| "epoch": 9.68804495252858, |
| "grad_norm": 1.6454648971557617, |
| "learning_rate": 1.5598213709044945e-06, |
| "loss": 0.012, |
| "num_input_tokens_seen": 1075193088, |
| "step": 1050000 |
| }, |
| { |
| "epoch": 9.692658307267878, |
| "grad_norm": 0.4671117663383484, |
| "learning_rate": 1.5367545972079978e-06, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1075705088, |
| "step": 1050500 |
| }, |
| { |
| "epoch": 9.697271662007179, |
| "grad_norm": 0.9937256574630737, |
| "learning_rate": 1.5136878235115012e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 1076217088, |
| "step": 1051000 |
| }, |
| { |
| "epoch": 9.701885016746477, |
| "grad_norm": 0.976679265499115, |
| "learning_rate": 1.4906210498150045e-06, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 1076729088, |
| "step": 1051500 |
| }, |
| { |
| "epoch": 9.706498371485777, |
| "grad_norm": 0.5003361701965332, |
| "learning_rate": 1.4675542761185078e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1077241088, |
| "step": 1052000 |
| }, |
| { |
| "epoch": 9.711111726225077, |
| "grad_norm": 0.7003839015960693, |
| "learning_rate": 1.4444875024220114e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1077753088, |
| "step": 1052500 |
| }, |
| { |
| "epoch": 9.715725080964376, |
| "grad_norm": 0.6862497925758362, |
| "learning_rate": 1.4214207287255147e-06, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1078265088, |
| "step": 1053000 |
| }, |
| { |
| "epoch": 9.720338435703676, |
| "grad_norm": 0.26981067657470703, |
| "learning_rate": 1.398353955029018e-06, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 1078777088, |
| "step": 1053500 |
| }, |
| { |
| "epoch": 9.724951790442974, |
| "grad_norm": 0.6135255694389343, |
| "learning_rate": 1.3752871813325216e-06, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1079289088, |
| "step": 1054000 |
| }, |
| { |
| "epoch": 9.729565145182274, |
| "grad_norm": 0.6279376149177551, |
| "learning_rate": 1.3522204076360247e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 1079801088, |
| "step": 1054500 |
| }, |
| { |
| "epoch": 9.734178499921573, |
| "grad_norm": 1.5329886674880981, |
| "learning_rate": 1.329153633939528e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 1080313088, |
| "step": 1055000 |
| }, |
| { |
| "epoch": 9.738791854660873, |
| "grad_norm": 1.2570598125457764, |
| "learning_rate": 1.3060868602430316e-06, |
| "loss": 0.0126, |
| "num_input_tokens_seen": 1080825088, |
| "step": 1055500 |
| }, |
| { |
| "epoch": 9.743405209400171, |
| "grad_norm": 1.8935927152633667, |
| "learning_rate": 1.283020086546535e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 1081337088, |
| "step": 1056000 |
| }, |
| { |
| "epoch": 9.748018564139471, |
| "grad_norm": 0.5364086031913757, |
| "learning_rate": 1.2599533128500385e-06, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 1081849088, |
| "step": 1056500 |
| }, |
| { |
| "epoch": 9.75263191887877, |
| "grad_norm": 0.6562399864196777, |
| "learning_rate": 1.2368865391535418e-06, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1082361088, |
| "step": 1057000 |
| }, |
| { |
| "epoch": 9.75724527361807, |
| "grad_norm": 0.7584030628204346, |
| "learning_rate": 1.213819765457045e-06, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1082873088, |
| "step": 1057500 |
| }, |
| { |
| "epoch": 9.76185862835737, |
| "grad_norm": 0.8746394515037537, |
| "learning_rate": 1.1907529917605485e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 1083385088, |
| "step": 1058000 |
| }, |
| { |
| "epoch": 9.766471983096668, |
| "grad_norm": 1.1132066249847412, |
| "learning_rate": 1.1676862180640518e-06, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 1083897088, |
| "step": 1058500 |
| }, |
| { |
| "epoch": 9.771085337835968, |
| "grad_norm": 0.7786855697631836, |
| "learning_rate": 1.1446194443675554e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1084409088, |
| "step": 1059000 |
| }, |
| { |
| "epoch": 9.775698692575267, |
| "grad_norm": 0.5935215353965759, |
| "learning_rate": 1.1215526706710587e-06, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 1084921088, |
| "step": 1059500 |
| }, |
| { |
| "epoch": 9.780312047314567, |
| "grad_norm": 1.0187913179397583, |
| "learning_rate": 1.098485896974562e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 1085433088, |
| "step": 1060000 |
| }, |
| { |
| "epoch": 9.784925402053865, |
| "grad_norm": 0.6144331693649292, |
| "learning_rate": 1.0754191232780654e-06, |
| "loss": 0.0123, |
| "num_input_tokens_seen": 1085945088, |
| "step": 1060500 |
| }, |
| { |
| "epoch": 9.789538756793165, |
| "grad_norm": 0.6357366442680359, |
| "learning_rate": 1.0523523495815687e-06, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 1086457088, |
| "step": 1061000 |
| }, |
| { |
| "epoch": 9.794152111532464, |
| "grad_norm": 8.163220405578613, |
| "learning_rate": 1.0292855758850723e-06, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 1086969088, |
| "step": 1061500 |
| }, |
| { |
| "epoch": 9.798765466271764, |
| "grad_norm": 1.1560457944869995, |
| "learning_rate": 1.0062188021885756e-06, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 1087481088, |
| "step": 1062000 |
| }, |
| { |
| "epoch": 9.803378821011062, |
| "grad_norm": 1.6285614967346191, |
| "learning_rate": 9.83152028492079e-07, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 1087993088, |
| "step": 1062500 |
| }, |
| { |
| "epoch": 9.807992175750362, |
| "grad_norm": 0.9213132858276367, |
| "learning_rate": 9.600852547955823e-07, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1088505088, |
| "step": 1063000 |
| }, |
| { |
| "epoch": 9.81260553048966, |
| "grad_norm": 0.7964446544647217, |
| "learning_rate": 9.370184810990857e-07, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1089017088, |
| "step": 1063500 |
| }, |
| { |
| "epoch": 9.81721888522896, |
| "grad_norm": 0.8223236799240112, |
| "learning_rate": 9.139517074025889e-07, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1089529088, |
| "step": 1064000 |
| }, |
| { |
| "epoch": 9.821832239968261, |
| "grad_norm": 0.9797717332839966, |
| "learning_rate": 8.908849337060925e-07, |
| "loss": 0.0126, |
| "num_input_tokens_seen": 1090041088, |
| "step": 1064500 |
| }, |
| { |
| "epoch": 9.82644559470756, |
| "grad_norm": 0.23104320466518402, |
| "learning_rate": 8.678181600095958e-07, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 1090553088, |
| "step": 1065000 |
| }, |
| { |
| "epoch": 9.83105894944686, |
| "grad_norm": 0.5735734105110168, |
| "learning_rate": 8.447513863130993e-07, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 1091065088, |
| "step": 1065500 |
| }, |
| { |
| "epoch": 9.835672304186158, |
| "grad_norm": 0.5744655132293701, |
| "learning_rate": 8.216846126166026e-07, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1091577088, |
| "step": 1066000 |
| }, |
| { |
| "epoch": 9.840285658925458, |
| "grad_norm": 4.304238319396973, |
| "learning_rate": 7.986178389201059e-07, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 1092089088, |
| "step": 1066500 |
| }, |
| { |
| "epoch": 9.844899013664756, |
| "grad_norm": 0.7492998838424683, |
| "learning_rate": 7.755510652236094e-07, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 1092601088, |
| "step": 1067000 |
| }, |
| { |
| "epoch": 9.849512368404056, |
| "grad_norm": 0.21370269358158112, |
| "learning_rate": 7.524842915271127e-07, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1093113088, |
| "step": 1067500 |
| }, |
| { |
| "epoch": 9.854125723143355, |
| "grad_norm": 1.3890074491500854, |
| "learning_rate": 7.294175178306161e-07, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1093625088, |
| "step": 1068000 |
| }, |
| { |
| "epoch": 9.858739077882655, |
| "grad_norm": 0.9255247116088867, |
| "learning_rate": 7.063507441341195e-07, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1094137088, |
| "step": 1068500 |
| }, |
| { |
| "epoch": 9.863352432621953, |
| "grad_norm": 0.617211639881134, |
| "learning_rate": 6.832839704376229e-07, |
| "loss": 0.0118, |
| "num_input_tokens_seen": 1094649088, |
| "step": 1069000 |
| }, |
| { |
| "epoch": 9.867965787361253, |
| "grad_norm": 0.7818981409072876, |
| "learning_rate": 6.602171967411263e-07, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1095161088, |
| "step": 1069500 |
| }, |
| { |
| "epoch": 9.872579142100552, |
| "grad_norm": 0.7910097241401672, |
| "learning_rate": 6.371504230446296e-07, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1095673088, |
| "step": 1070000 |
| }, |
| { |
| "epoch": 9.877192496839852, |
| "grad_norm": 0.9167271256446838, |
| "learning_rate": 6.14083649348133e-07, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 1096185088, |
| "step": 1070500 |
| }, |
| { |
| "epoch": 9.881805851579152, |
| "grad_norm": 0.4515294134616852, |
| "learning_rate": 5.910168756516364e-07, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 1096697088, |
| "step": 1071000 |
| }, |
| { |
| "epoch": 9.88641920631845, |
| "grad_norm": 1.4242569208145142, |
| "learning_rate": 5.679501019551397e-07, |
| "loss": 0.0123, |
| "num_input_tokens_seen": 1097209088, |
| "step": 1071500 |
| }, |
| { |
| "epoch": 9.89103256105775, |
| "grad_norm": 1.5031037330627441, |
| "learning_rate": 5.448833282586431e-07, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1097721088, |
| "step": 1072000 |
| }, |
| { |
| "epoch": 9.895645915797049, |
| "grad_norm": 0.5102546215057373, |
| "learning_rate": 5.218165545621465e-07, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 1098233088, |
| "step": 1072500 |
| }, |
| { |
| "epoch": 9.900259270536349, |
| "grad_norm": 0.5648242831230164, |
| "learning_rate": 4.987497808656499e-07, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1098745088, |
| "step": 1073000 |
| }, |
| { |
| "epoch": 9.904872625275647, |
| "grad_norm": 1.368865728378296, |
| "learning_rate": 4.756830071691533e-07, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 1099257088, |
| "step": 1073500 |
| }, |
| { |
| "epoch": 9.909485980014948, |
| "grad_norm": 0.372745156288147, |
| "learning_rate": 4.5261623347265665e-07, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 1099769088, |
| "step": 1074000 |
| }, |
| { |
| "epoch": 9.914099334754246, |
| "grad_norm": 0.5571704506874084, |
| "learning_rate": 4.2954945977616003e-07, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1100281088, |
| "step": 1074500 |
| }, |
| { |
| "epoch": 9.918712689493546, |
| "grad_norm": 0.44755375385284424, |
| "learning_rate": 4.064826860796634e-07, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 1100793088, |
| "step": 1075000 |
| }, |
| { |
| "epoch": 9.923326044232844, |
| "grad_norm": 0.467204749584198, |
| "learning_rate": 3.834159123831668e-07, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 1101305088, |
| "step": 1075500 |
| }, |
| { |
| "epoch": 9.927939398972145, |
| "grad_norm": 1.1227315664291382, |
| "learning_rate": 3.603491386866702e-07, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 1101817088, |
| "step": 1076000 |
| }, |
| { |
| "epoch": 9.932552753711445, |
| "grad_norm": 0.8583968877792358, |
| "learning_rate": 3.3728236499017353e-07, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 1102329088, |
| "step": 1076500 |
| }, |
| { |
| "epoch": 9.937166108450743, |
| "grad_norm": 0.830702543258667, |
| "learning_rate": 3.14215591293677e-07, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1102841088, |
| "step": 1077000 |
| }, |
| { |
| "epoch": 9.941779463190043, |
| "grad_norm": 1.864600658416748, |
| "learning_rate": 2.9114881759718036e-07, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1103353088, |
| "step": 1077500 |
| }, |
| { |
| "epoch": 9.946392817929341, |
| "grad_norm": 0.8975169658660889, |
| "learning_rate": 2.680820439006837e-07, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 1103865088, |
| "step": 1078000 |
| }, |
| { |
| "epoch": 9.951006172668642, |
| "grad_norm": 0.7767340540885925, |
| "learning_rate": 2.450152702041871e-07, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1104377088, |
| "step": 1078500 |
| }, |
| { |
| "epoch": 9.95561952740794, |
| "grad_norm": 0.6193325519561768, |
| "learning_rate": 2.2194849650769047e-07, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 1104889088, |
| "step": 1079000 |
| }, |
| { |
| "epoch": 9.96023288214724, |
| "grad_norm": 1.1023420095443726, |
| "learning_rate": 1.9888172281119386e-07, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 1105401088, |
| "step": 1079500 |
| }, |
| { |
| "epoch": 9.964846236886538, |
| "grad_norm": 1.2743160724639893, |
| "learning_rate": 1.7581494911469725e-07, |
| "loss": 0.0119, |
| "num_input_tokens_seen": 1105913088, |
| "step": 1080000 |
| }, |
| { |
| "epoch": 9.969459591625839, |
| "grad_norm": 0.7009992599487305, |
| "learning_rate": 1.527481754182006e-07, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 1106425088, |
| "step": 1080500 |
| }, |
| { |
| "epoch": 9.974072946365137, |
| "grad_norm": 0.5736069679260254, |
| "learning_rate": 1.29681401721704e-07, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 1106937088, |
| "step": 1081000 |
| }, |
| { |
| "epoch": 9.978686301104437, |
| "grad_norm": 0.4789179861545563, |
| "learning_rate": 1.0661462802520738e-07, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 1107449088, |
| "step": 1081500 |
| }, |
| { |
| "epoch": 9.983299655843737, |
| "grad_norm": 0.7064932584762573, |
| "learning_rate": 8.354785432871076e-08, |
| "loss": 0.0122, |
| "num_input_tokens_seen": 1107961088, |
| "step": 1082000 |
| }, |
| { |
| "epoch": 9.987913010583036, |
| "grad_norm": 1.0066189765930176, |
| "learning_rate": 6.048108063221414e-08, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 1108473088, |
| "step": 1082500 |
| }, |
| { |
| "epoch": 9.992526365322336, |
| "grad_norm": 1.61360502243042, |
| "learning_rate": 3.7414306935717514e-08, |
| "loss": 0.0135, |
| "num_input_tokens_seen": 1108985088, |
| "step": 1083000 |
| }, |
| { |
| "epoch": 9.997139720061634, |
| "grad_norm": 0.37303218245506287, |
| "learning_rate": 1.4347533239220898e-08, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1109497088, |
| "step": 1083500 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_combined_score": 0.06429717740844736, |
| "eval_loss": 0.06429717689752579, |
| "eval_mse": 0.06429717791936893, |
| "eval_runtime": 46.2743, |
| "eval_samples_per_second": 2081.892, |
| "eval_steps_per_second": 260.253, |
| "num_input_tokens_seen": 1109813760, |
| "step": 1083810 |
| }, |
| { |
| "epoch": 10.0, |
| "num_input_tokens_seen": 1109813760, |
| "step": 1083810, |
| "total_flos": 1.4278349548463616e+17, |
| "train_loss": 0.035630166295778455, |
| "train_runtime": 37672.0963, |
| "train_samples_per_second": 230.155, |
| "train_steps_per_second": 28.77, |
| "train_tokens_per_second": 29459.836 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 1083810, |
| "num_input_tokens_seen": 1109813760, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.4278349548463616e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|