{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 93654,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003203280158882696,
"grad_norm": 0.9955105781555176,
"learning_rate": 5e-06,
"loss": 1.2173,
"num_input_tokens_seen": 819200,
"step": 100
},
{
"epoch": 0.006406560317765392,
"grad_norm": 8.62949275970459,
"learning_rate": 1e-05,
"loss": 1.1953,
"num_input_tokens_seen": 1638400,
"step": 200
},
{
"epoch": 0.009609840476648087,
"grad_norm": 1.0293811559677124,
"learning_rate": 1.5e-05,
"loss": 1.1905,
"num_input_tokens_seen": 2457600,
"step": 300
},
{
"epoch": 0.012813120635530783,
"grad_norm": 6.295543193817139,
"learning_rate": 2e-05,
"loss": 1.1391,
"num_input_tokens_seen": 3276800,
"step": 400
},
{
"epoch": 0.01601640079441348,
"grad_norm": 3.0551528930664062,
"learning_rate": 2.5e-05,
"loss": 1.1383,
"num_input_tokens_seen": 4096000,
"step": 500
},
{
"epoch": 0.019219680953296174,
"grad_norm": 0.8111634850502014,
"learning_rate": 3e-05,
"loss": 1.1022,
"num_input_tokens_seen": 4915200,
"step": 600
},
{
"epoch": 0.022422961112178872,
"grad_norm": 0.77763432264328,
"learning_rate": 3.5e-05,
"loss": 1.0805,
"num_input_tokens_seen": 5734400,
"step": 700
},
{
"epoch": 0.025626241271061567,
"grad_norm": 1.9141496419906616,
"learning_rate": 4e-05,
"loss": 1.0755,
"num_input_tokens_seen": 6553600,
"step": 800
},
{
"epoch": 0.028829521429944265,
"grad_norm": 0.8061490058898926,
"learning_rate": 4.5e-05,
"loss": 1.0995,
"num_input_tokens_seen": 7372800,
"step": 900
},
{
"epoch": 0.03203280158882696,
"grad_norm": 0.6671661734580994,
"learning_rate": 5e-05,
"loss": 1.0835,
"num_input_tokens_seen": 8192000,
"step": 1000
},
{
"epoch": 0.035236081747709654,
"grad_norm": 2.4559221267700195,
"learning_rate": 4.9999856291983216e-05,
"loss": 1.0848,
"num_input_tokens_seen": 9011200,
"step": 1100
},
{
"epoch": 0.03843936190659235,
"grad_norm": 0.6218218803405762,
"learning_rate": 4.9999425169585025e-05,
"loss": 1.0621,
"num_input_tokens_seen": 9830400,
"step": 1200
},
{
"epoch": 0.04164264206547505,
"grad_norm": 1.1977851390838623,
"learning_rate": 4.999870663776188e-05,
"loss": 1.0774,
"num_input_tokens_seen": 10649600,
"step": 1300
},
{
"epoch": 0.044845922224357744,
"grad_norm": 0.581513524055481,
"learning_rate": 4.99977007047745e-05,
"loss": 1.0204,
"num_input_tokens_seen": 11468800,
"step": 1400
},
{
"epoch": 0.04804920238324044,
"grad_norm": 0.6710864901542664,
"learning_rate": 4.999640738218772e-05,
"loss": 1.0509,
"num_input_tokens_seen": 12288000,
"step": 1500
},
{
"epoch": 0.05125248254212313,
"grad_norm": 2.048499345779419,
"learning_rate": 4.99948266848704e-05,
"loss": 1.1401,
"num_input_tokens_seen": 13107200,
"step": 1600
},
{
"epoch": 0.05445576270100583,
"grad_norm": 0.6593829989433289,
"learning_rate": 4.999295863099528e-05,
"loss": 1.042,
"num_input_tokens_seen": 13926400,
"step": 1700
},
{
"epoch": 0.05765904285988853,
"grad_norm": 0.5166763663291931,
"learning_rate": 4.999080324203867e-05,
"loss": 1.1398,
"num_input_tokens_seen": 14745600,
"step": 1800
},
{
"epoch": 0.060862323018771224,
"grad_norm": 0.4539300203323364,
"learning_rate": 4.9988360542780333e-05,
"loss": 1.0759,
"num_input_tokens_seen": 15564800,
"step": 1900
},
{
"epoch": 0.06406560317765392,
"grad_norm": 0.7282894253730774,
"learning_rate": 4.998563056130308e-05,
"loss": 1.0988,
"num_input_tokens_seen": 16384000,
"step": 2000
},
{
"epoch": 0.06726888333653662,
"grad_norm": 0.6337546706199646,
"learning_rate": 4.998261332899255e-05,
"loss": 1.0642,
"num_input_tokens_seen": 17203200,
"step": 2100
},
{
"epoch": 0.07047216349541931,
"grad_norm": 0.6283242702484131,
"learning_rate": 4.997930888053677e-05,
"loss": 1.076,
"num_input_tokens_seen": 18022400,
"step": 2200
},
{
"epoch": 0.07367544365430201,
"grad_norm": 0.6066380739212036,
"learning_rate": 4.99757172539258e-05,
"loss": 1.0616,
"num_input_tokens_seen": 18841600,
"step": 2300
},
{
"epoch": 0.0768787238131847,
"grad_norm": 0.506839394569397,
"learning_rate": 4.997183849045129e-05,
"loss": 1.0691,
"num_input_tokens_seen": 19660800,
"step": 2400
},
{
"epoch": 0.0800820039720674,
"grad_norm": 0.6370711922645569,
"learning_rate": 4.996767263470599e-05,
"loss": 1.0463,
"num_input_tokens_seen": 20480000,
"step": 2500
},
{
"epoch": 0.0832852841309501,
"grad_norm": 2.0462234020233154,
"learning_rate": 4.996321973458325e-05,
"loss": 1.0703,
"num_input_tokens_seen": 21299200,
"step": 2600
},
{
"epoch": 0.08648856428983279,
"grad_norm": 0.6036199331283569,
"learning_rate": 4.9958479841276446e-05,
"loss": 1.0397,
"num_input_tokens_seen": 22118400,
"step": 2700
},
{
"epoch": 0.08969184444871549,
"grad_norm": 0.6303982138633728,
"learning_rate": 4.995345300927845e-05,
"loss": 1.0837,
"num_input_tokens_seen": 22937600,
"step": 2800
},
{
"epoch": 0.09289512460759818,
"grad_norm": 0.5572041869163513,
"learning_rate": 4.994813929638096e-05,
"loss": 1.0399,
"num_input_tokens_seen": 23756800,
"step": 2900
},
{
"epoch": 0.09609840476648088,
"grad_norm": 0.6958311200141907,
"learning_rate": 4.9942538763673794e-05,
"loss": 1.0634,
"num_input_tokens_seen": 24576000,
"step": 3000
},
{
"epoch": 0.09930168492536358,
"grad_norm": 0.583613395690918,
"learning_rate": 4.993665147554429e-05,
"loss": 1.0472,
"num_input_tokens_seen": 25395200,
"step": 3100
},
{
"epoch": 0.10250496508424627,
"grad_norm": 0.5093560814857483,
"learning_rate": 4.9930477499676495e-05,
"loss": 1.0774,
"num_input_tokens_seen": 26214400,
"step": 3200
},
{
"epoch": 0.10570824524312897,
"grad_norm": 1.930864691734314,
"learning_rate": 4.992401690705038e-05,
"loss": 1.0402,
"num_input_tokens_seen": 27033600,
"step": 3300
},
{
"epoch": 0.10891152540201166,
"grad_norm": 0.6102778911590576,
"learning_rate": 4.9917269771941056e-05,
"loss": 1.0353,
"num_input_tokens_seen": 27852800,
"step": 3400
},
{
"epoch": 0.11211480556089436,
"grad_norm": 0.5592427849769592,
"learning_rate": 4.991023617191792e-05,
"loss": 1.0776,
"num_input_tokens_seen": 28672000,
"step": 3500
},
{
"epoch": 0.11531808571977706,
"grad_norm": 0.6671651005744934,
"learning_rate": 4.990291618784377e-05,
"loss": 1.1083,
"num_input_tokens_seen": 29491200,
"step": 3600
},
{
"epoch": 0.11852136587865975,
"grad_norm": 1.4246577024459839,
"learning_rate": 4.989530990387381e-05,
"loss": 1.0262,
"num_input_tokens_seen": 30310400,
"step": 3700
},
{
"epoch": 0.12172464603754245,
"grad_norm": 2.4318628311157227,
"learning_rate": 4.988741740745477e-05,
"loss": 1.0441,
"num_input_tokens_seen": 31129600,
"step": 3800
},
{
"epoch": 0.12492792619642513,
"grad_norm": 2.1933786869049072,
"learning_rate": 4.987923878932386e-05,
"loss": 1.0375,
"num_input_tokens_seen": 31948800,
"step": 3900
},
{
"epoch": 0.12813120635530784,
"grad_norm": 0.5265761017799377,
"learning_rate": 4.9870774143507696e-05,
"loss": 1.0041,
"num_input_tokens_seen": 32768000,
"step": 4000
},
{
"epoch": 0.13133448651419052,
"grad_norm": 0.6378248929977417,
"learning_rate": 4.98620235673213e-05,
"loss": 1.0798,
"num_input_tokens_seen": 33587200,
"step": 4100
},
{
"epoch": 0.13453776667307324,
"grad_norm": 0.5426807999610901,
"learning_rate": 4.9852987161366895e-05,
"loss": 1.1014,
"num_input_tokens_seen": 34406400,
"step": 4200
},
{
"epoch": 0.13774104683195593,
"grad_norm": 0.587978720664978,
"learning_rate": 4.9843665029532796e-05,
"loss": 1.0321,
"num_input_tokens_seen": 35225600,
"step": 4300
},
{
"epoch": 0.14094432699083861,
"grad_norm": 0.8025338649749756,
"learning_rate": 4.983405727899221e-05,
"loss": 0.9954,
"num_input_tokens_seen": 36044800,
"step": 4400
},
{
"epoch": 0.1441476071497213,
"grad_norm": 0.5788518786430359,
"learning_rate": 4.982416402020201e-05,
"loss": 1.0049,
"num_input_tokens_seen": 36864000,
"step": 4500
},
{
"epoch": 0.14735088730860402,
"grad_norm": 0.629861056804657,
"learning_rate": 4.9813985366901435e-05,
"loss": 1.0586,
"num_input_tokens_seen": 37683200,
"step": 4600
},
{
"epoch": 0.1505541674674867,
"grad_norm": 0.5835918188095093,
"learning_rate": 4.980352143611081e-05,
"loss": 1.0949,
"num_input_tokens_seen": 38502400,
"step": 4700
},
{
"epoch": 0.1537574476263694,
"grad_norm": 0.5552580952644348,
"learning_rate": 4.979277234813021e-05,
"loss": 1.0374,
"num_input_tokens_seen": 39321600,
"step": 4800
},
{
"epoch": 0.1569607277852521,
"grad_norm": 0.7137876749038696,
"learning_rate": 4.978173822653802e-05,
"loss": 1.0195,
"num_input_tokens_seen": 40140800,
"step": 4900
},
{
"epoch": 0.1601640079441348,
"grad_norm": 0.6314465403556824,
"learning_rate": 4.9770419198189595e-05,
"loss": 1.0661,
"num_input_tokens_seen": 40960000,
"step": 5000
},
{
"epoch": 0.16336728810301748,
"grad_norm": 0.5494422316551208,
"learning_rate": 4.975881539321574e-05,
"loss": 1.0168,
"num_input_tokens_seen": 41779200,
"step": 5100
},
{
"epoch": 0.1665705682619002,
"grad_norm": 2.2284624576568604,
"learning_rate": 4.974692694502123e-05,
"loss": 1.0523,
"num_input_tokens_seen": 42598400,
"step": 5200
},
{
"epoch": 0.16977384842078289,
"grad_norm": 0.5189602375030518,
"learning_rate": 4.973475399028331e-05,
"loss": 1.0294,
"num_input_tokens_seen": 43417600,
"step": 5300
},
{
"epoch": 0.17297712857966557,
"grad_norm": 2.1537561416625977,
"learning_rate": 4.972229666895006e-05,
"loss": 0.9866,
"num_input_tokens_seen": 44236800,
"step": 5400
},
{
"epoch": 0.17618040873854826,
"grad_norm": 0.5834473967552185,
"learning_rate": 4.970955512423884e-05,
"loss": 0.99,
"num_input_tokens_seen": 45056000,
"step": 5500
},
{
"epoch": 0.17938368889743098,
"grad_norm": 0.6151788830757141,
"learning_rate": 4.969652950263462e-05,
"loss": 1.0292,
"num_input_tokens_seen": 45875200,
"step": 5600
},
{
"epoch": 0.18258696905631366,
"grad_norm": 0.641342043876648,
"learning_rate": 4.96832199538883e-05,
"loss": 1.0712,
"num_input_tokens_seen": 46694400,
"step": 5700
},
{
"epoch": 0.18579024921519635,
"grad_norm": 0.7882746458053589,
"learning_rate": 4.966962663101499e-05,
"loss": 1.0279,
"num_input_tokens_seen": 47513600,
"step": 5800
},
{
"epoch": 0.18899352937407907,
"grad_norm": 0.633734405040741,
"learning_rate": 4.965574969029223e-05,
"loss": 1.0448,
"num_input_tokens_seen": 48332800,
"step": 5900
},
{
"epoch": 0.19219680953296175,
"grad_norm": 1.5470919609069824,
"learning_rate": 4.9641589291258255e-05,
"loss": 1.0492,
"num_input_tokens_seen": 49152000,
"step": 6000
},
{
"epoch": 0.19540008969184444,
"grad_norm": 1.6563118696212769,
"learning_rate": 4.962714559671008e-05,
"loss": 1.0593,
"num_input_tokens_seen": 49971200,
"step": 6100
},
{
"epoch": 0.19860336985072716,
"grad_norm": 0.6741557717323303,
"learning_rate": 4.961241877270169e-05,
"loss": 1.0054,
"num_input_tokens_seen": 50790400,
"step": 6200
},
{
"epoch": 0.20180665000960984,
"grad_norm": 0.6842678785324097,
"learning_rate": 4.9597408988542096e-05,
"loss": 0.9865,
"num_input_tokens_seen": 51609600,
"step": 6300
},
{
"epoch": 0.20500993016849253,
"grad_norm": 8.189310073852539,
"learning_rate": 4.958211641679339e-05,
"loss": 1.0529,
"num_input_tokens_seen": 52428800,
"step": 6400
},
{
"epoch": 0.20821321032737522,
"grad_norm": 0.8904711604118347,
"learning_rate": 4.956654123326881e-05,
"loss": 1.0272,
"num_input_tokens_seen": 53248000,
"step": 6500
},
{
"epoch": 0.21141649048625794,
"grad_norm": 0.7857553362846375,
"learning_rate": 4.9550683617030624e-05,
"loss": 1.0295,
"num_input_tokens_seen": 54067200,
"step": 6600
},
{
"epoch": 0.21461977064514062,
"grad_norm": 0.6658555865287781,
"learning_rate": 4.9534543750388185e-05,
"loss": 0.9849,
"num_input_tokens_seen": 54886400,
"step": 6700
},
{
"epoch": 0.2178230508040233,
"grad_norm": 0.6390406489372253,
"learning_rate": 4.951812181889573e-05,
"loss": 0.9597,
"num_input_tokens_seen": 55705600,
"step": 6800
},
{
"epoch": 0.22102633096290603,
"grad_norm": 0.5161400437355042,
"learning_rate": 4.950141801135034e-05,
"loss": 1.0008,
"num_input_tokens_seen": 56524800,
"step": 6900
},
{
"epoch": 0.2242296111217887,
"grad_norm": 0.7651511430740356,
"learning_rate": 4.948443251978968e-05,
"loss": 0.9889,
"num_input_tokens_seen": 57344000,
"step": 7000
},
{
"epoch": 0.2274328912806714,
"grad_norm": 0.5069282054901123,
"learning_rate": 4.946716553948987e-05,
"loss": 0.9869,
"num_input_tokens_seen": 58163200,
"step": 7100
},
{
"epoch": 0.23063617143955412,
"grad_norm": 0.5041384696960449,
"learning_rate": 4.9449617268963164e-05,
"loss": 0.9669,
"num_input_tokens_seen": 58982400,
"step": 7200
},
{
"epoch": 0.2338394515984368,
"grad_norm": 1.7203638553619385,
"learning_rate": 4.943178790995576e-05,
"loss": 1.0426,
"num_input_tokens_seen": 59801600,
"step": 7300
},
{
"epoch": 0.2370427317573195,
"grad_norm": 0.8364699482917786,
"learning_rate": 4.941367766744539e-05,
"loss": 0.9894,
"num_input_tokens_seen": 60620800,
"step": 7400
},
{
"epoch": 0.24024601191620218,
"grad_norm": 0.42120370268821716,
"learning_rate": 4.939528674963902e-05,
"loss": 0.996,
"num_input_tokens_seen": 61440000,
"step": 7500
},
{
"epoch": 0.2434492920750849,
"grad_norm": 4.017838001251221,
"learning_rate": 4.937661536797044e-05,
"loss": 1.0557,
"num_input_tokens_seen": 62259200,
"step": 7600
},
{
"epoch": 0.24665257223396758,
"grad_norm": 0.7951923608779907,
"learning_rate": 4.9357663737097824e-05,
"loss": 1.0614,
"num_input_tokens_seen": 63078400,
"step": 7700
},
{
"epoch": 0.24985585239285027,
"grad_norm": 0.7139900922775269,
"learning_rate": 4.9338432074901276e-05,
"loss": 1.0525,
"num_input_tokens_seen": 63897600,
"step": 7800
},
{
"epoch": 0.25305913255173296,
"grad_norm": 0.6686214208602905,
"learning_rate": 4.931892060248032e-05,
"loss": 1.0947,
"num_input_tokens_seen": 64716800,
"step": 7900
},
{
"epoch": 0.2562624127106157,
"grad_norm": 0.737429678440094,
"learning_rate": 4.929912954415135e-05,
"loss": 0.9886,
"num_input_tokens_seen": 65536000,
"step": 8000
},
{
"epoch": 0.2594656928694984,
"grad_norm": 0.49794241786003113,
"learning_rate": 4.9279059127445074e-05,
"loss": 1.0407,
"num_input_tokens_seen": 66355200,
"step": 8100
},
{
"epoch": 0.26266897302838105,
"grad_norm": 0.6615239977836609,
"learning_rate": 4.925870958310388e-05,
"loss": 1.021,
"num_input_tokens_seen": 67174400,
"step": 8200
},
{
"epoch": 0.26587225318726376,
"grad_norm": 1.568616509437561,
"learning_rate": 4.923808114507916e-05,
"loss": 1.027,
"num_input_tokens_seen": 67993600,
"step": 8300
},
{
"epoch": 0.2690755333461465,
"grad_norm": 0.6627603769302368,
"learning_rate": 4.921717405052868e-05,
"loss": 1.0552,
"num_input_tokens_seen": 68812800,
"step": 8400
},
{
"epoch": 0.27227881350502914,
"grad_norm": 0.5849776864051819,
"learning_rate": 4.9195988539813814e-05,
"loss": 1.0552,
"num_input_tokens_seen": 69632000,
"step": 8500
},
{
"epoch": 0.27548209366391185,
"grad_norm": 1.6558514833450317,
"learning_rate": 4.917452485649677e-05,
"loss": 1.0516,
"num_input_tokens_seen": 70451200,
"step": 8600
},
{
"epoch": 0.27868537382279457,
"grad_norm": 0.5784972310066223,
"learning_rate": 4.9152783247337823e-05,
"loss": 1.0425,
"num_input_tokens_seen": 71270400,
"step": 8700
},
{
"epoch": 0.28188865398167723,
"grad_norm": 0.713585376739502,
"learning_rate": 4.9130763962292453e-05,
"loss": 1.0633,
"num_input_tokens_seen": 72089600,
"step": 8800
},
{
"epoch": 0.28509193414055994,
"grad_norm": 0.678617000579834,
"learning_rate": 4.9108467254508487e-05,
"loss": 1.0208,
"num_input_tokens_seen": 72908800,
"step": 8900
},
{
"epoch": 0.2882952142994426,
"grad_norm": 0.6494852900505066,
"learning_rate": 4.908589338032316e-05,
"loss": 1.0193,
"num_input_tokens_seen": 73728000,
"step": 9000
},
{
"epoch": 0.2914984944583253,
"grad_norm": 0.6913178563117981,
"learning_rate": 4.9063042599260234e-05,
"loss": 0.9783,
"num_input_tokens_seen": 74547200,
"step": 9100
},
{
"epoch": 0.29470177461720803,
"grad_norm": 0.6419298648834229,
"learning_rate": 4.9039915174026916e-05,
"loss": 1.0251,
"num_input_tokens_seen": 75366400,
"step": 9200
},
{
"epoch": 0.2979050547760907,
"grad_norm": 0.6663874983787537,
"learning_rate": 4.9016511370510945e-05,
"loss": 1.009,
"num_input_tokens_seen": 76185600,
"step": 9300
},
{
"epoch": 0.3011083349349734,
"grad_norm": 0.5730396509170532,
"learning_rate": 4.8992831457777446e-05,
"loss": 1.0154,
"num_input_tokens_seen": 77004800,
"step": 9400
},
{
"epoch": 0.3043116150938561,
"grad_norm": 0.5048360824584961,
"learning_rate": 4.896887570806588e-05,
"loss": 1.0498,
"num_input_tokens_seen": 77824000,
"step": 9500
},
{
"epoch": 0.3075148952527388,
"grad_norm": 1.7296109199523926,
"learning_rate": 4.89446443967869e-05,
"loss": 1.0426,
"num_input_tokens_seen": 78643200,
"step": 9600
},
{
"epoch": 0.3107181754116215,
"grad_norm": 0.8863735198974609,
"learning_rate": 4.892013780251922e-05,
"loss": 0.9947,
"num_input_tokens_seen": 79462400,
"step": 9700
},
{
"epoch": 0.3139214555705042,
"grad_norm": 2.7898573875427246,
"learning_rate": 4.889535620700635e-05,
"loss": 1.0301,
"num_input_tokens_seen": 80281600,
"step": 9800
},
{
"epoch": 0.3171247357293869,
"grad_norm": 0.5569226741790771,
"learning_rate": 4.887029989515341e-05,
"loss": 0.976,
"num_input_tokens_seen": 81100800,
"step": 9900
},
{
"epoch": 0.3203280158882696,
"grad_norm": 0.46732258796691895,
"learning_rate": 4.884496915502385e-05,
"loss": 1.0477,
"num_input_tokens_seen": 81920000,
"step": 10000
},
{
"epoch": 0.3235312960471523,
"grad_norm": 0.45553821325302124,
"learning_rate": 4.881936427783607e-05,
"loss": 1.0019,
"num_input_tokens_seen": 82739200,
"step": 10100
},
{
"epoch": 0.32673457620603497,
"grad_norm": 0.7193503379821777,
"learning_rate": 4.879348555796018e-05,
"loss": 0.997,
"num_input_tokens_seen": 83558400,
"step": 10200
},
{
"epoch": 0.3299378563649177,
"grad_norm": 0.6309390664100647,
"learning_rate": 4.8767333292914544e-05,
"loss": 0.9891,
"num_input_tokens_seen": 84377600,
"step": 10300
},
{
"epoch": 0.3331411365238004,
"grad_norm": 0.555618166923523,
"learning_rate": 4.874090778336235e-05,
"loss": 1.0175,
"num_input_tokens_seen": 85196800,
"step": 10400
},
{
"epoch": 0.33634441668268306,
"grad_norm": 1.5369619131088257,
"learning_rate": 4.8714209333108236e-05,
"loss": 1.0151,
"num_input_tokens_seen": 86016000,
"step": 10500
},
{
"epoch": 0.33954769684156577,
"grad_norm": 0.5254389047622681,
"learning_rate": 4.868723824909469e-05,
"loss": 1.025,
"num_input_tokens_seen": 86835200,
"step": 10600
},
{
"epoch": 0.3427509770004485,
"grad_norm": 0.5323970913887024,
"learning_rate": 4.8659994841398594e-05,
"loss": 1.0334,
"num_input_tokens_seen": 87654400,
"step": 10700
},
{
"epoch": 0.34595425715933115,
"grad_norm": 0.602602481842041,
"learning_rate": 4.863247942322764e-05,
"loss": 1.0237,
"num_input_tokens_seen": 88473600,
"step": 10800
},
{
"epoch": 0.34915753731821386,
"grad_norm": 2.1106760501861572,
"learning_rate": 4.860469231091671e-05,
"loss": 1.0181,
"num_input_tokens_seen": 89292800,
"step": 10900
},
{
"epoch": 0.3523608174770965,
"grad_norm": 0.6294669508934021,
"learning_rate": 4.857663382392428e-05,
"loss": 1.0289,
"num_input_tokens_seen": 90112000,
"step": 11000
},
{
"epoch": 0.35556409763597924,
"grad_norm": 0.5473527908325195,
"learning_rate": 4.854830428482871e-05,
"loss": 1.0296,
"num_input_tokens_seen": 90931200,
"step": 11100
},
{
"epoch": 0.35876737779486195,
"grad_norm": 0.5963702201843262,
"learning_rate": 4.851970401932454e-05,
"loss": 0.9784,
"num_input_tokens_seen": 91750400,
"step": 11200
},
{
"epoch": 0.3619706579537446,
"grad_norm": 1.5987745523452759,
"learning_rate": 4.849083335621878e-05,
"loss": 1.0842,
"num_input_tokens_seen": 92569600,
"step": 11300
},
{
"epoch": 0.3651739381126273,
"grad_norm": 1.9906154870986938,
"learning_rate": 4.846169262742709e-05,
"loss": 1.0196,
"num_input_tokens_seen": 93388800,
"step": 11400
},
{
"epoch": 0.36837721827151004,
"grad_norm": 0.7897935509681702,
"learning_rate": 4.843228216796996e-05,
"loss": 1.0103,
"num_input_tokens_seen": 94208000,
"step": 11500
},
{
"epoch": 0.3715804984303927,
"grad_norm": 0.6737790107727051,
"learning_rate": 4.8402602315968905e-05,
"loss": 1.0551,
"num_input_tokens_seen": 95027200,
"step": 11600
},
{
"epoch": 0.3747837785892754,
"grad_norm": 0.5573664307594299,
"learning_rate": 4.837265341264253e-05,
"loss": 1.0221,
"num_input_tokens_seen": 95846400,
"step": 11700
},
{
"epoch": 0.37798705874815813,
"grad_norm": 0.6558005809783936,
"learning_rate": 4.834243580230266e-05,
"loss": 0.975,
"num_input_tokens_seen": 96665600,
"step": 11800
},
{
"epoch": 0.3811903389070408,
"grad_norm": 0.7646604776382446,
"learning_rate": 4.831194983235029e-05,
"loss": 1.0152,
"num_input_tokens_seen": 97484800,
"step": 11900
},
{
"epoch": 0.3843936190659235,
"grad_norm": 0.5662313103675842,
"learning_rate": 4.82811958532717e-05,
"loss": 0.9909,
"num_input_tokens_seen": 98304000,
"step": 12000
},
{
"epoch": 0.3875968992248062,
"grad_norm": 0.5597667098045349,
"learning_rate": 4.825017421863436e-05,
"loss": 1.0208,
"num_input_tokens_seen": 99123200,
"step": 12100
},
{
"epoch": 0.3908001793836889,
"grad_norm": 0.5832675099372864,
"learning_rate": 4.821888528508287e-05,
"loss": 1.0189,
"num_input_tokens_seen": 99942400,
"step": 12200
},
{
"epoch": 0.3940034595425716,
"grad_norm": 1.6424989700317383,
"learning_rate": 4.8187329412334884e-05,
"loss": 1.055,
"num_input_tokens_seen": 100761600,
"step": 12300
},
{
"epoch": 0.3972067397014543,
"grad_norm": 0.4590611755847931,
"learning_rate": 4.815550696317695e-05,
"loss": 1.0586,
"num_input_tokens_seen": 101580800,
"step": 12400
},
{
"epoch": 0.400410019860337,
"grad_norm": 0.5123792290687561,
"learning_rate": 4.812341830346035e-05,
"loss": 1.0073,
"num_input_tokens_seen": 102400000,
"step": 12500
},
{
"epoch": 0.4036133000192197,
"grad_norm": 1.7758103609085083,
"learning_rate": 4.80910638020969e-05,
"loss": 1.0012,
"num_input_tokens_seen": 103219200,
"step": 12600
},
{
"epoch": 0.40681658017810235,
"grad_norm": 0.6465420722961426,
"learning_rate": 4.805844383105469e-05,
"loss": 0.9919,
"num_input_tokens_seen": 104038400,
"step": 12700
},
{
"epoch": 0.41001986033698506,
"grad_norm": 0.6052021980285645,
"learning_rate": 4.802555876535383e-05,
"loss": 1.0369,
"num_input_tokens_seen": 104857600,
"step": 12800
},
{
"epoch": 0.4132231404958678,
"grad_norm": 0.5069152116775513,
"learning_rate": 4.799240898306214e-05,
"loss": 1.0105,
"num_input_tokens_seen": 105676800,
"step": 12900
},
{
"epoch": 0.41642642065475044,
"grad_norm": 0.6421388387680054,
"learning_rate": 4.7958994865290766e-05,
"loss": 0.9861,
"num_input_tokens_seen": 106496000,
"step": 13000
},
{
"epoch": 0.41962970081363316,
"grad_norm": 0.6774849891662598,
"learning_rate": 4.7925316796189826e-05,
"loss": 0.9771,
"num_input_tokens_seen": 107315200,
"step": 13100
},
{
"epoch": 0.42283298097251587,
"grad_norm": 2.159661293029785,
"learning_rate": 4.789137516294402e-05,
"loss": 1.0182,
"num_input_tokens_seen": 108134400,
"step": 13200
},
{
"epoch": 0.42603626113139853,
"grad_norm": 0.6035510301589966,
"learning_rate": 4.785717035576812e-05,
"loss": 1.036,
"num_input_tokens_seen": 108953600,
"step": 13300
},
{
"epoch": 0.42923954129028125,
"grad_norm": 1.6665889024734497,
"learning_rate": 4.782270276790254e-05,
"loss": 1.0713,
"num_input_tokens_seen": 109772800,
"step": 13400
},
{
"epoch": 0.43244282144916396,
"grad_norm": 0.702918291091919,
"learning_rate": 4.778797279560876e-05,
"loss": 0.9708,
"num_input_tokens_seen": 110592000,
"step": 13500
},
{
"epoch": 0.4356461016080466,
"grad_norm": 0.6358348727226257,
"learning_rate": 4.775298083816482e-05,
"loss": 0.9967,
"num_input_tokens_seen": 111411200,
"step": 13600
},
{
"epoch": 0.43884938176692934,
"grad_norm": 0.652087390422821,
"learning_rate": 4.77177272978607e-05,
"loss": 1.0333,
"num_input_tokens_seen": 112230400,
"step": 13700
},
{
"epoch": 0.44205266192581205,
"grad_norm": 0.6892516016960144,
"learning_rate": 4.768221257999373e-05,
"loss": 1.0308,
"num_input_tokens_seen": 113049600,
"step": 13800
},
{
"epoch": 0.4452559420846947,
"grad_norm": 0.6279174089431763,
"learning_rate": 4.764643709286386e-05,
"loss": 1.057,
"num_input_tokens_seen": 113868800,
"step": 13900
},
{
"epoch": 0.4484592222435774,
"grad_norm": 0.6180372834205627,
"learning_rate": 4.761040124776904e-05,
"loss": 1.0059,
"num_input_tokens_seen": 114688000,
"step": 14000
},
{
"epoch": 0.45166250240246014,
"grad_norm": 0.6153070330619812,
"learning_rate": 4.757410545900047e-05,
"loss": 1.0717,
"num_input_tokens_seen": 115507200,
"step": 14100
},
{
"epoch": 0.4548657825613428,
"grad_norm": 0.5821653604507446,
"learning_rate": 4.7537550143837796e-05,
"loss": 1.0313,
"num_input_tokens_seen": 116326400,
"step": 14200
},
{
"epoch": 0.4580690627202255,
"grad_norm": 0.5773714780807495,
"learning_rate": 4.750073572254438e-05,
"loss": 1.0296,
"num_input_tokens_seen": 117145600,
"step": 14300
},
{
"epoch": 0.46127234287910823,
"grad_norm": 0.7084370255470276,
"learning_rate": 4.746366261836242e-05,
"loss": 0.9977,
"num_input_tokens_seen": 117964800,
"step": 14400
},
{
"epoch": 0.4644756230379909,
"grad_norm": 0.719439685344696,
"learning_rate": 4.742633125750808e-05,
"loss": 0.9753,
"num_input_tokens_seen": 118784000,
"step": 14500
},
{
"epoch": 0.4676789031968736,
"grad_norm": 0.6266898512840271,
"learning_rate": 4.738874206916665e-05,
"loss": 0.9722,
"num_input_tokens_seen": 119603200,
"step": 14600
},
{
"epoch": 0.47088218335575627,
"grad_norm": 0.6483869552612305,
"learning_rate": 4.7350895485487526e-05,
"loss": 1.066,
"num_input_tokens_seen": 120422400,
"step": 14700
},
{
"epoch": 0.474085463514639,
"grad_norm": 0.5138384699821472,
"learning_rate": 4.731279194157933e-05,
"loss": 0.973,
"num_input_tokens_seen": 121241600,
"step": 14800
},
{
"epoch": 0.4772887436735217,
"grad_norm": 0.6580103039741516,
"learning_rate": 4.727443187550481e-05,
"loss": 0.9922,
"num_input_tokens_seen": 122060800,
"step": 14900
},
{
"epoch": 0.48049202383240436,
"grad_norm": 0.6680930852890015,
"learning_rate": 4.723581572827592e-05,
"loss": 0.9851,
"num_input_tokens_seen": 122880000,
"step": 15000
},
{
"epoch": 0.4836953039912871,
"grad_norm": 2.329383373260498,
"learning_rate": 4.719694394384863e-05,
"loss": 1.0284,
"num_input_tokens_seen": 123699200,
"step": 15100
},
{
"epoch": 0.4868985841501698,
"grad_norm": 0.7416221499443054,
"learning_rate": 4.715781696911792e-05,
"loss": 0.9828,
"num_input_tokens_seen": 124518400,
"step": 15200
},
{
"epoch": 0.49010186430905245,
"grad_norm": 0.5373809337615967,
"learning_rate": 4.7118435253912575e-05,
"loss": 0.9621,
"num_input_tokens_seen": 125337600,
"step": 15300
},
{
"epoch": 0.49330514446793516,
"grad_norm": 0.5429302453994751,
"learning_rate": 4.7078799250990056e-05,
"loss": 1.013,
"num_input_tokens_seen": 126156800,
"step": 15400
},
{
"epoch": 0.4965084246268179,
"grad_norm": 0.5449560284614563,
"learning_rate": 4.7038909416031276e-05,
"loss": 1.0564,
"num_input_tokens_seen": 126976000,
"step": 15500
},
{
"epoch": 0.49971170478570054,
"grad_norm": 0.6629030704498291,
"learning_rate": 4.699876620763535e-05,
"loss": 0.9828,
"num_input_tokens_seen": 127795200,
"step": 15600
},
{
"epoch": 0.5029149849445832,
"grad_norm": 0.6022646427154541,
"learning_rate": 4.6958370087314344e-05,
"loss": 1.0435,
"num_input_tokens_seen": 128614400,
"step": 15700
},
{
"epoch": 0.5061182651034659,
"grad_norm": 1.8832833766937256,
"learning_rate": 4.691772151948799e-05,
"loss": 0.9438,
"num_input_tokens_seen": 129433600,
"step": 15800
},
{
"epoch": 0.5093215452623486,
"grad_norm": 0.7114049196243286,
"learning_rate": 4.687682097147826e-05,
"loss": 0.947,
"num_input_tokens_seen": 130252800,
"step": 15900
},
{
"epoch": 0.5125248254212313,
"grad_norm": 1.7428299188613892,
"learning_rate": 4.683566891350412e-05,
"loss": 0.9461,
"num_input_tokens_seen": 131072000,
"step": 16000
},
{
"epoch": 0.5157281055801141,
"grad_norm": 0.7306798100471497,
"learning_rate": 4.679426581867599e-05,
"loss": 0.9964,
"num_input_tokens_seen": 131891200,
"step": 16100
},
{
"epoch": 0.5189313857389968,
"grad_norm": 0.6088542938232422,
"learning_rate": 4.675261216299042e-05,
"loss": 0.9499,
"num_input_tokens_seen": 132710400,
"step": 16200
},
{
"epoch": 0.5221346658978794,
"grad_norm": 1.0487473011016846,
"learning_rate": 4.6710708425324545e-05,
"loss": 1.0205,
"num_input_tokens_seen": 133529600,
"step": 16300
},
{
"epoch": 0.5253379460567621,
"grad_norm": 0.4886884093284607,
"learning_rate": 4.6668555087430605e-05,
"loss": 0.9996,
"num_input_tokens_seen": 134348800,
"step": 16400
},
{
"epoch": 0.5285412262156448,
"grad_norm": 0.8639355301856995,
"learning_rate": 4.662615263393041e-05,
"loss": 1.0013,
"num_input_tokens_seen": 135168000,
"step": 16500
},
{
"epoch": 0.5317445063745275,
"grad_norm": 2.132063865661621,
"learning_rate": 4.658350155230976e-05,
"loss": 1.0437,
"num_input_tokens_seen": 135987200,
"step": 16600
},
{
"epoch": 0.5349477865334102,
"grad_norm": 0.5800316333770752,
"learning_rate": 4.6540602332912854e-05,
"loss": 1.0094,
"num_input_tokens_seen": 136806400,
"step": 16700
},
{
"epoch": 0.538151066692293,
"grad_norm": 0.48361486196517944,
"learning_rate": 4.6497455468936606e-05,
"loss": 1.0141,
"num_input_tokens_seen": 137625600,
"step": 16800
},
{
"epoch": 0.5413543468511756,
"grad_norm": 0.5760986804962158,
"learning_rate": 4.645406145642506e-05,
"loss": 1.0359,
"num_input_tokens_seen": 138444800,
"step": 16900
},
{
"epoch": 0.5445576270100583,
"grad_norm": 0.42741426825523376,
"learning_rate": 4.64104207942636e-05,
"loss": 0.9605,
"num_input_tokens_seen": 139264000,
"step": 17000
},
{
"epoch": 0.547760907168941,
"grad_norm": 0.6151024103164673,
"learning_rate": 4.6366533984173274e-05,
"loss": 0.9502,
"num_input_tokens_seen": 140083200,
"step": 17100
},
{
"epoch": 0.5509641873278237,
"grad_norm": 5.775717735290527,
"learning_rate": 4.6322401530704995e-05,
"loss": 1.016,
"num_input_tokens_seen": 140902400,
"step": 17200
},
{
"epoch": 0.5541674674867064,
"grad_norm": 0.5886793732643127,
"learning_rate": 4.627802394123375e-05,
"loss": 1.0039,
"num_input_tokens_seen": 141721600,
"step": 17300
},
{
"epoch": 0.5573707476455891,
"grad_norm": 2.4064829349517822,
"learning_rate": 4.623340172595277e-05,
"loss": 0.9972,
"num_input_tokens_seen": 142540800,
"step": 17400
},
{
"epoch": 0.5605740278044717,
"grad_norm": 0.5964205861091614,
"learning_rate": 4.6188535397867675e-05,
"loss": 0.9894,
"num_input_tokens_seen": 143360000,
"step": 17500
},
{
"epoch": 0.5637773079633545,
"grad_norm": 0.5683798789978027,
"learning_rate": 4.614342547279052e-05,
"loss": 1.0721,
"num_input_tokens_seen": 144179200,
"step": 17600
},
{
"epoch": 0.5669805881222372,
"grad_norm": 0.5441416501998901,
"learning_rate": 4.609807246933395e-05,
"loss": 1.0183,
"num_input_tokens_seen": 144998400,
"step": 17700
},
{
"epoch": 0.5701838682811199,
"grad_norm": 2.547898530960083,
"learning_rate": 4.605247690890518e-05,
"loss": 1.0083,
"num_input_tokens_seen": 145817600,
"step": 17800
},
{
"epoch": 0.5733871484400026,
"grad_norm": 0.7640330791473389,
"learning_rate": 4.600663931570001e-05,
"loss": 0.9927,
"num_input_tokens_seen": 146636800,
"step": 17900
},
{
"epoch": 0.5765904285988852,
"grad_norm": 0.6045035123825073,
"learning_rate": 4.596056021669681e-05,
"loss": 1.0144,
"num_input_tokens_seen": 147456000,
"step": 18000
},
{
"epoch": 0.5797937087577679,
"grad_norm": 0.5718028545379639,
"learning_rate": 4.591424014165047e-05,
"loss": 1.0417,
"num_input_tokens_seen": 148275200,
"step": 18100
},
{
"epoch": 0.5829969889166506,
"grad_norm": 0.49183499813079834,
"learning_rate": 4.586767962308625e-05,
"loss": 1.0124,
"num_input_tokens_seen": 149094400,
"step": 18200
},
{
"epoch": 0.5862002690755334,
"grad_norm": 0.5138664841651917,
"learning_rate": 4.5820879196293756e-05,
"loss": 0.9961,
"num_input_tokens_seen": 149913600,
"step": 18300
},
{
"epoch": 0.5894035492344161,
"grad_norm": 0.6507889628410339,
"learning_rate": 4.577383939932069e-05,
"loss": 1.0066,
"num_input_tokens_seen": 150732800,
"step": 18400
},
{
"epoch": 0.5926068293932988,
"grad_norm": 0.48219242691993713,
"learning_rate": 4.572656077296676e-05,
"loss": 1.0422,
"num_input_tokens_seen": 151552000,
"step": 18500
},
{
"epoch": 0.5958101095521814,
"grad_norm": 2.981851100921631,
"learning_rate": 4.567904386077734e-05,
"loss": 1.0647,
"num_input_tokens_seen": 152371200,
"step": 18600
},
{
"epoch": 0.5990133897110641,
"grad_norm": 1.6492716073989868,
"learning_rate": 4.563128920903735e-05,
"loss": 1.0465,
"num_input_tokens_seen": 153190400,
"step": 18700
},
{
"epoch": 0.6022166698699468,
"grad_norm": 0.6568962335586548,
"learning_rate": 4.558329736676488e-05,
"loss": 1.0505,
"num_input_tokens_seen": 154009600,
"step": 18800
},
{
"epoch": 0.6054199500288295,
"grad_norm": 0.77339768409729,
"learning_rate": 4.553506888570494e-05,
"loss": 1.0287,
"num_input_tokens_seen": 154828800,
"step": 18900
},
{
"epoch": 0.6086232301877122,
"grad_norm": 0.6354805827140808,
"learning_rate": 4.548660432032307e-05,
"loss": 0.9675,
"num_input_tokens_seen": 155648000,
"step": 19000
},
{
"epoch": 0.611826510346595,
"grad_norm": 0.6528341770172119,
"learning_rate": 4.5437904227799e-05,
"loss": 1.0027,
"num_input_tokens_seen": 156467200,
"step": 19100
},
{
"epoch": 0.6150297905054776,
"grad_norm": 0.7518653273582458,
"learning_rate": 4.538896916802023e-05,
"loss": 1.0002,
"num_input_tokens_seen": 157286400,
"step": 19200
},
{
"epoch": 0.6182330706643603,
"grad_norm": 1.2601783275604248,
"learning_rate": 4.533979970357558e-05,
"loss": 1.0698,
"num_input_tokens_seen": 158105600,
"step": 19300
},
{
"epoch": 0.621436350823243,
"grad_norm": 0.7242873311042786,
"learning_rate": 4.529039639974876e-05,
"loss": 0.9834,
"num_input_tokens_seen": 158924800,
"step": 19400
},
{
"epoch": 0.6246396309821257,
"grad_norm": 2.0396833419799805,
"learning_rate": 4.524075982451183e-05,
"loss": 0.9634,
"num_input_tokens_seen": 159744000,
"step": 19500
},
{
"epoch": 0.6278429111410084,
"grad_norm": 2.7037477493286133,
"learning_rate": 4.5190890548518696e-05,
"loss": 1.0221,
"num_input_tokens_seen": 160563200,
"step": 19600
},
{
"epoch": 0.631046191299891,
"grad_norm": 1.6231496334075928,
"learning_rate": 4.5140789145098536e-05,
"loss": 1.0582,
"num_input_tokens_seen": 161382400,
"step": 19700
},
{
"epoch": 0.6342494714587738,
"grad_norm": 0.6004766225814819,
"learning_rate": 4.509045619024921e-05,
"loss": 1.0112,
"num_input_tokens_seen": 162201600,
"step": 19800
},
{
"epoch": 0.6374527516176565,
"grad_norm": 12.123788833618164,
"learning_rate": 4.5039892262630656e-05,
"loss": 1.0078,
"num_input_tokens_seen": 163020800,
"step": 19900
},
{
"epoch": 0.6406560317765392,
"grad_norm": 3.2375683784484863,
"learning_rate": 4.498909794355821e-05,
"loss": 1.0239,
"num_input_tokens_seen": 163840000,
"step": 20000
},
{
"epoch": 0.6438593119354219,
"grad_norm": 0.8260817527770996,
"learning_rate": 4.493807381699595e-05,
"loss": 1.009,
"num_input_tokens_seen": 164659200,
"step": 20100
},
{
"epoch": 0.6470625920943046,
"grad_norm": 0.7712699174880981,
"learning_rate": 4.488682046954994e-05,
"loss": 0.9565,
"num_input_tokens_seen": 165478400,
"step": 20200
},
{
"epoch": 0.6502658722531872,
"grad_norm": 0.5889214277267456,
"learning_rate": 4.483533849046155e-05,
"loss": 1.0225,
"num_input_tokens_seen": 166297600,
"step": 20300
},
{
"epoch": 0.6534691524120699,
"grad_norm": 1.2388112545013428,
"learning_rate": 4.4783628471600636e-05,
"loss": 1.0642,
"num_input_tokens_seen": 167116800,
"step": 20400
},
{
"epoch": 0.6566724325709526,
"grad_norm": 0.6664971709251404,
"learning_rate": 4.473169100745871e-05,
"loss": 0.9598,
"num_input_tokens_seen": 167936000,
"step": 20500
},
{
"epoch": 0.6598757127298354,
"grad_norm": 0.5350831151008606,
"learning_rate": 4.4679526695142195e-05,
"loss": 1.0391,
"num_input_tokens_seen": 168755200,
"step": 20600
},
{
"epoch": 0.6630789928887181,
"grad_norm": 0.6643035411834717,
"learning_rate": 4.4627136134365463e-05,
"loss": 0.998,
"num_input_tokens_seen": 169574400,
"step": 20700
},
{
"epoch": 0.6662822730476008,
"grad_norm": 0.5972053408622742,
"learning_rate": 4.457451992744402e-05,
"loss": 1.0335,
"num_input_tokens_seen": 170393600,
"step": 20800
},
{
"epoch": 0.6694855532064834,
"grad_norm": 0.5102434754371643,
"learning_rate": 4.452167867928751e-05,
"loss": 1.0459,
"num_input_tokens_seen": 171212800,
"step": 20900
},
{
"epoch": 0.6726888333653661,
"grad_norm": 0.5346103310585022,
"learning_rate": 4.4468612997392824e-05,
"loss": 0.9922,
"num_input_tokens_seen": 172032000,
"step": 21000
},
{
"epoch": 0.6758921135242488,
"grad_norm": 0.5129193663597107,
"learning_rate": 4.441532349183706e-05,
"loss": 1.0024,
"num_input_tokens_seen": 172851200,
"step": 21100
},
{
"epoch": 0.6790953936831315,
"grad_norm": 0.5462967753410339,
"learning_rate": 4.4361810775270554e-05,
"loss": 0.994,
"num_input_tokens_seen": 173670400,
"step": 21200
},
{
"epoch": 0.6822986738420143,
"grad_norm": 1.2343724966049194,
"learning_rate": 4.430807546290982e-05,
"loss": 0.9669,
"num_input_tokens_seen": 174489600,
"step": 21300
},
{
"epoch": 0.685501954000897,
"grad_norm": 0.653947651386261,
"learning_rate": 4.425411817253048e-05,
"loss": 1.0029,
"num_input_tokens_seen": 175308800,
"step": 21400
},
{
"epoch": 0.6887052341597796,
"grad_norm": 2.948323965072632,
"learning_rate": 4.419993952446013e-05,
"loss": 1.0158,
"num_input_tokens_seen": 176128000,
"step": 21500
},
{
"epoch": 0.6919085143186623,
"grad_norm": 1.577588438987732,
"learning_rate": 4.414554014157127e-05,
"loss": 1.0571,
"num_input_tokens_seen": 176947200,
"step": 21600
},
{
"epoch": 0.695111794477545,
"grad_norm": 1.0136100053787231,
"learning_rate": 4.4090920649274095e-05,
"loss": 0.9647,
"num_input_tokens_seen": 177766400,
"step": 21700
},
{
"epoch": 0.6983150746364277,
"grad_norm": 0.5571495294570923,
"learning_rate": 4.40360816755093e-05,
"loss": 0.9609,
"num_input_tokens_seen": 178585600,
"step": 21800
},
{
"epoch": 0.7015183547953104,
"grad_norm": 0.5548049211502075,
"learning_rate": 4.3981023850740926e-05,
"loss": 0.9524,
"num_input_tokens_seen": 179404800,
"step": 21900
},
{
"epoch": 0.704721634954193,
"grad_norm": 0.9693801999092102,
"learning_rate": 4.392574780794901e-05,
"loss": 0.9641,
"num_input_tokens_seen": 180224000,
"step": 22000
},
{
"epoch": 0.7079249151130758,
"grad_norm": 0.6628372669219971,
"learning_rate": 4.387025418262242e-05,
"loss": 0.9838,
"num_input_tokens_seen": 181043200,
"step": 22100
},
{
"epoch": 0.7111281952719585,
"grad_norm": 0.5312179923057556,
"learning_rate": 4.381454361275143e-05,
"loss": 1.0309,
"num_input_tokens_seen": 181862400,
"step": 22200
},
{
"epoch": 0.7143314754308412,
"grad_norm": 0.6137087941169739,
"learning_rate": 4.3758616738820506e-05,
"loss": 1.0029,
"num_input_tokens_seen": 182681600,
"step": 22300
},
{
"epoch": 0.7175347555897239,
"grad_norm": 1.6591495275497437,
"learning_rate": 4.370247420380085e-05,
"loss": 0.9842,
"num_input_tokens_seen": 183500800,
"step": 22400
},
{
"epoch": 0.7207380357486066,
"grad_norm": 0.677762508392334,
"learning_rate": 4.3646116653143046e-05,
"loss": 0.9606,
"num_input_tokens_seen": 184320000,
"step": 22500
},
{
"epoch": 0.7239413159074892,
"grad_norm": 0.602687418460846,
"learning_rate": 4.358954473476965e-05,
"loss": 0.9781,
"num_input_tokens_seen": 185139200,
"step": 22600
},
{
"epoch": 0.7271445960663719,
"grad_norm": 0.5638014674186707,
"learning_rate": 4.353275909906772e-05,
"loss": 0.9823,
"num_input_tokens_seen": 185958400,
"step": 22700
},
{
"epoch": 0.7303478762252547,
"grad_norm": 1.6680676937103271,
"learning_rate": 4.3475760398881325e-05,
"loss": 0.988,
"num_input_tokens_seen": 186777600,
"step": 22800
},
{
"epoch": 0.7335511563841374,
"grad_norm": 0.6449896097183228,
"learning_rate": 4.3418549289504096e-05,
"loss": 0.9878,
"num_input_tokens_seen": 187596800,
"step": 22900
},
{
"epoch": 0.7367544365430201,
"grad_norm": 2.6768717765808105,
"learning_rate": 4.3361126428671636e-05,
"loss": 1.0091,
"num_input_tokens_seen": 188416000,
"step": 23000
},
{
"epoch": 0.7399577167019028,
"grad_norm": 1.079026460647583,
"learning_rate": 4.330349247655398e-05,
"loss": 1.0383,
"num_input_tokens_seen": 189235200,
"step": 23100
},
{
"epoch": 0.7431609968607854,
"grad_norm": 0.6426740288734436,
"learning_rate": 4.324564809574799e-05,
"loss": 0.9801,
"num_input_tokens_seen": 190054400,
"step": 23200
},
{
"epoch": 0.7463642770196681,
"grad_norm": 0.8264270424842834,
"learning_rate": 4.318759395126979e-05,
"loss": 1.0095,
"num_input_tokens_seen": 190873600,
"step": 23300
},
{
"epoch": 0.7495675571785508,
"grad_norm": 0.5160927176475525,
"learning_rate": 4.3129330710547035e-05,
"loss": 0.9601,
"num_input_tokens_seen": 191692800,
"step": 23400
},
{
"epoch": 0.7527708373374336,
"grad_norm": 0.6011959910392761,
"learning_rate": 4.307085904341133e-05,
"loss": 0.9837,
"num_input_tokens_seen": 192512000,
"step": 23500
},
{
"epoch": 0.7559741174963163,
"grad_norm": 0.5961838960647583,
"learning_rate": 4.3012179622090436e-05,
"loss": 0.9647,
"num_input_tokens_seen": 193331200,
"step": 23600
},
{
"epoch": 0.7591773976551989,
"grad_norm": 0.8201313614845276,
"learning_rate": 4.295329312120063e-05,
"loss": 0.9439,
"num_input_tokens_seen": 194150400,
"step": 23700
},
{
"epoch": 0.7623806778140816,
"grad_norm": 0.5474829077720642,
"learning_rate": 4.289420021773889e-05,
"loss": 0.9708,
"num_input_tokens_seen": 194969600,
"step": 23800
},
{
"epoch": 0.7655839579729643,
"grad_norm": 0.5124524235725403,
"learning_rate": 4.283490159107513e-05,
"loss": 1.0109,
"num_input_tokens_seen": 195788800,
"step": 23900
},
{
"epoch": 0.768787238131847,
"grad_norm": 0.6800445318222046,
"learning_rate": 4.27753979229444e-05,
"loss": 1.0119,
"num_input_tokens_seen": 196608000,
"step": 24000
},
{
"epoch": 0.7719905182907297,
"grad_norm": 0.5350146889686584,
"learning_rate": 4.271568989743903e-05,
"loss": 0.9659,
"num_input_tokens_seen": 197427200,
"step": 24100
},
{
"epoch": 0.7751937984496124,
"grad_norm": 0.6650831699371338,
"learning_rate": 4.265577820100076e-05,
"loss": 0.9729,
"num_input_tokens_seen": 198246400,
"step": 24200
},
{
"epoch": 0.778397078608495,
"grad_norm": 0.5228304862976074,
"learning_rate": 4.2595663522412884e-05,
"loss": 0.9633,
"num_input_tokens_seen": 199065600,
"step": 24300
},
{
"epoch": 0.7816003587673778,
"grad_norm": 0.532375693321228,
"learning_rate": 4.253534655279232e-05,
"loss": 0.9687,
"num_input_tokens_seen": 199884800,
"step": 24400
},
{
"epoch": 0.7848036389262605,
"grad_norm": 0.8860092759132385,
"learning_rate": 4.247482798558161e-05,
"loss": 1.0017,
"num_input_tokens_seen": 200704000,
"step": 24500
},
{
"epoch": 0.7880069190851432,
"grad_norm": 2.975177526473999,
"learning_rate": 4.241410851654102e-05,
"loss": 0.9905,
"num_input_tokens_seen": 201523200,
"step": 24600
},
{
"epoch": 0.7912101992440259,
"grad_norm": 0.622031033039093,
"learning_rate": 4.235318884374051e-05,
"loss": 1.0358,
"num_input_tokens_seen": 202342400,
"step": 24700
},
{
"epoch": 0.7944134794029086,
"grad_norm": 1.7574553489685059,
"learning_rate": 4.229206966755172e-05,
"loss": 1.0105,
"num_input_tokens_seen": 203161600,
"step": 24800
},
{
"epoch": 0.7976167595617912,
"grad_norm": 0.7439371347427368,
"learning_rate": 4.223075169063989e-05,
"loss": 0.9345,
"num_input_tokens_seen": 203980800,
"step": 24900
},
{
"epoch": 0.800820039720674,
"grad_norm": 0.5452560782432556,
"learning_rate": 4.21692356179558e-05,
"loss": 0.9655,
"num_input_tokens_seen": 204800000,
"step": 25000
},
{
"epoch": 0.8040233198795567,
"grad_norm": 0.5876986384391785,
"learning_rate": 4.210752215672769e-05,
"loss": 0.949,
"num_input_tokens_seen": 205619200,
"step": 25100
},
{
"epoch": 0.8072266000384394,
"grad_norm": 2.6809980869293213,
"learning_rate": 4.204561201645307e-05,
"loss": 1.0082,
"num_input_tokens_seen": 206438400,
"step": 25200
},
{
"epoch": 0.8104298801973221,
"grad_norm": 0.647762656211853,
"learning_rate": 4.198350590889064e-05,
"loss": 1.0074,
"num_input_tokens_seen": 207257600,
"step": 25300
},
{
"epoch": 0.8136331603562047,
"grad_norm": 0.4822922945022583,
"learning_rate": 4.192120454805203e-05,
"loss": 0.9638,
"num_input_tokens_seen": 208076800,
"step": 25400
},
{
"epoch": 0.8168364405150874,
"grad_norm": 9.964862823486328,
"learning_rate": 4.185870865019364e-05,
"loss": 0.9793,
"num_input_tokens_seen": 208896000,
"step": 25500
},
{
"epoch": 0.8200397206739701,
"grad_norm": 0.6270651817321777,
"learning_rate": 4.17960189338084e-05,
"loss": 0.9515,
"num_input_tokens_seen": 209715200,
"step": 25600
},
{
"epoch": 0.8232430008328528,
"grad_norm": 0.5813098549842834,
"learning_rate": 4.17331361196175e-05,
"loss": 0.9659,
"num_input_tokens_seen": 210534400,
"step": 25700
},
{
"epoch": 0.8264462809917356,
"grad_norm": 0.5864317417144775,
"learning_rate": 4.167006093056209e-05,
"loss": 1.0496,
"num_input_tokens_seen": 211353600,
"step": 25800
},
{
"epoch": 0.8296495611506183,
"grad_norm": 2.7955405712127686,
"learning_rate": 4.1606794091795e-05,
"loss": 0.9466,
"num_input_tokens_seen": 212172800,
"step": 25900
},
{
"epoch": 0.8328528413095009,
"grad_norm": 0.5431935787200928,
"learning_rate": 4.154333633067238e-05,
"loss": 0.9308,
"num_input_tokens_seen": 212992000,
"step": 26000
},
{
"epoch": 0.8360561214683836,
"grad_norm": 2.313504934310913,
"learning_rate": 4.147968837674535e-05,
"loss": 0.9996,
"num_input_tokens_seen": 213811200,
"step": 26100
},
{
"epoch": 0.8392594016272663,
"grad_norm": 0.6028672456741333,
"learning_rate": 4.141585096175162e-05,
"loss": 0.9862,
"num_input_tokens_seen": 214630400,
"step": 26200
},
{
"epoch": 0.842462681786149,
"grad_norm": 1.6038614511489868,
"learning_rate": 4.1351824819607056e-05,
"loss": 1.0175,
"num_input_tokens_seen": 215449600,
"step": 26300
},
{
"epoch": 0.8456659619450317,
"grad_norm": 0.6132040619850159,
"learning_rate": 4.128761068639723e-05,
"loss": 0.9903,
"num_input_tokens_seen": 216268800,
"step": 26400
},
{
"epoch": 0.8488692421039145,
"grad_norm": 1.7026666402816772,
"learning_rate": 4.122320930036902e-05,
"loss": 1.0261,
"num_input_tokens_seen": 217088000,
"step": 26500
},
{
"epoch": 0.8520725222627971,
"grad_norm": 0.6355572938919067,
"learning_rate": 4.1158621401922046e-05,
"loss": 1.0048,
"num_input_tokens_seen": 217907200,
"step": 26600
},
{
"epoch": 0.8552758024216798,
"grad_norm": 0.683513879776001,
"learning_rate": 4.109384773360023e-05,
"loss": 0.9659,
"num_input_tokens_seen": 218726400,
"step": 26700
},
{
"epoch": 0.8584790825805625,
"grad_norm": 0.6867396831512451,
"learning_rate": 4.10288890400832e-05,
"loss": 1.0134,
"num_input_tokens_seen": 219545600,
"step": 26800
},
{
"epoch": 0.8616823627394452,
"grad_norm": 0.4578529894351959,
"learning_rate": 4.0963746068177744e-05,
"loss": 1.0011,
"num_input_tokens_seen": 220364800,
"step": 26900
},
{
"epoch": 0.8648856428983279,
"grad_norm": 0.5275700688362122,
"learning_rate": 4.089841956680927e-05,
"loss": 1.0777,
"num_input_tokens_seen": 221184000,
"step": 27000
},
{
"epoch": 0.8680889230572106,
"grad_norm": 0.5704593658447266,
"learning_rate": 4.08329102870131e-05,
"loss": 1.0113,
"num_input_tokens_seen": 222003200,
"step": 27100
},
{
"epoch": 0.8712922032160932,
"grad_norm": 0.5546739101409912,
"learning_rate": 4.076721898192597e-05,
"loss": 1.0181,
"num_input_tokens_seen": 222822400,
"step": 27200
},
{
"epoch": 0.874495483374976,
"grad_norm": 0.4796381890773773,
"learning_rate": 4.070134640677722e-05,
"loss": 0.9882,
"num_input_tokens_seen": 223641600,
"step": 27300
},
{
"epoch": 0.8776987635338587,
"grad_norm": 8.13311767578125,
"learning_rate": 4.063529331888024e-05,
"loss": 0.9378,
"num_input_tokens_seen": 224460800,
"step": 27400
},
{
"epoch": 0.8809020436927414,
"grad_norm": 0.4969484806060791,
"learning_rate": 4.056906047762368e-05,
"loss": 0.9867,
"num_input_tokens_seen": 225280000,
"step": 27500
},
{
"epoch": 0.8841053238516241,
"grad_norm": 3.9572601318359375,
"learning_rate": 4.0502648644462774e-05,
"loss": 0.9645,
"num_input_tokens_seen": 226099200,
"step": 27600
},
{
"epoch": 0.8873086040105067,
"grad_norm": 2.1928722858428955,
"learning_rate": 4.043605858291053e-05,
"loss": 0.9678,
"num_input_tokens_seen": 226918400,
"step": 27700
},
{
"epoch": 0.8905118841693894,
"grad_norm": 0.7099782824516296,
"learning_rate": 4.036929105852901e-05,
"loss": 1.0127,
"num_input_tokens_seen": 227737600,
"step": 27800
},
{
"epoch": 0.8937151643282721,
"grad_norm": 0.6126459836959839,
"learning_rate": 4.0302346838920514e-05,
"loss": 1.0439,
"num_input_tokens_seen": 228556800,
"step": 27900
},
{
"epoch": 0.8969184444871549,
"grad_norm": 0.6163774728775024,
"learning_rate": 4.02352266937187e-05,
"loss": 0.9393,
"num_input_tokens_seen": 229376000,
"step": 28000
},
{
"epoch": 0.9001217246460376,
"grad_norm": 0.6306945085525513,
"learning_rate": 4.016793139457982e-05,
"loss": 0.8966,
"num_input_tokens_seen": 230195200,
"step": 28100
},
{
"epoch": 0.9033250048049203,
"grad_norm": 0.6520447134971619,
"learning_rate": 4.0100461715173777e-05,
"loss": 0.9861,
"num_input_tokens_seen": 231014400,
"step": 28200
},
{
"epoch": 0.9065282849638029,
"grad_norm": 0.5960193276405334,
"learning_rate": 4.003281843117528e-05,
"loss": 1.0012,
"num_input_tokens_seen": 231833600,
"step": 28300
},
{
"epoch": 0.9097315651226856,
"grad_norm": 0.6080912947654724,
"learning_rate": 3.9965002320254924e-05,
"loss": 0.9602,
"num_input_tokens_seen": 232652800,
"step": 28400
},
{
"epoch": 0.9129348452815683,
"grad_norm": 0.6659435033798218,
"learning_rate": 3.989701416207019e-05,
"loss": 0.988,
"num_input_tokens_seen": 233472000,
"step": 28500
},
{
"epoch": 0.916138125440451,
"grad_norm": 2.5207667350769043,
"learning_rate": 3.9828854738256564e-05,
"loss": 1.0339,
"num_input_tokens_seen": 234291200,
"step": 28600
},
{
"epoch": 0.9193414055993337,
"grad_norm": 2.4952239990234375,
"learning_rate": 3.976052483241849e-05,
"loss": 1.0025,
"num_input_tokens_seen": 235110400,
"step": 28700
},
{
"epoch": 0.9225446857582165,
"grad_norm": 0.6766204237937927,
"learning_rate": 3.969202523012038e-05,
"loss": 1.0335,
"num_input_tokens_seen": 235929600,
"step": 28800
},
{
"epoch": 0.9257479659170991,
"grad_norm": 0.666861891746521,
"learning_rate": 3.9623356718877605e-05,
"loss": 0.9721,
"num_input_tokens_seen": 236748800,
"step": 28900
},
{
"epoch": 0.9289512460759818,
"grad_norm": 0.5322718620300293,
"learning_rate": 3.955452008814741e-05,
"loss": 0.9866,
"num_input_tokens_seen": 237568000,
"step": 29000
},
{
"epoch": 0.9321545262348645,
"grad_norm": 0.6603706479072571,
"learning_rate": 3.9485516129319844e-05,
"loss": 0.9863,
"num_input_tokens_seen": 238387200,
"step": 29100
},
{
"epoch": 0.9353578063937472,
"grad_norm": 0.6650800704956055,
"learning_rate": 3.9416345635708676e-05,
"loss": 0.9902,
"num_input_tokens_seen": 239206400,
"step": 29200
},
{
"epoch": 0.9385610865526299,
"grad_norm": 2.477098226547241,
"learning_rate": 3.9347009402542256e-05,
"loss": 0.991,
"num_input_tokens_seen": 240025600,
"step": 29300
},
{
"epoch": 0.9417643667115125,
"grad_norm": 0.6523051261901855,
"learning_rate": 3.9277508226954394e-05,
"loss": 0.9851,
"num_input_tokens_seen": 240844800,
"step": 29400
},
{
"epoch": 0.9449676468703953,
"grad_norm": 0.7197608351707458,
"learning_rate": 3.920784290797519e-05,
"loss": 1.0144,
"num_input_tokens_seen": 241664000,
"step": 29500
},
{
"epoch": 0.948170927029278,
"grad_norm": 0.6857073903083801,
"learning_rate": 3.9138014246521806e-05,
"loss": 0.9529,
"num_input_tokens_seen": 242483200,
"step": 29600
},
{
"epoch": 0.9513742071881607,
"grad_norm": 0.616074800491333,
"learning_rate": 3.906802304538935e-05,
"loss": 0.9949,
"num_input_tokens_seen": 243302400,
"step": 29700
},
{
"epoch": 0.9545774873470434,
"grad_norm": 0.5982092022895813,
"learning_rate": 3.899787010924152e-05,
"loss": 0.9596,
"num_input_tokens_seen": 244121600,
"step": 29800
},
{
"epoch": 0.9577807675059261,
"grad_norm": 0.6943311095237732,
"learning_rate": 3.8927556244601495e-05,
"loss": 0.9813,
"num_input_tokens_seen": 244940800,
"step": 29900
},
{
"epoch": 0.9609840476648087,
"grad_norm": 0.7715808153152466,
"learning_rate": 3.885708225984254e-05,
"loss": 0.9747,
"num_input_tokens_seen": 245760000,
"step": 30000
},
{
"epoch": 0.9641873278236914,
"grad_norm": 0.6129135489463806,
"learning_rate": 3.878644896517879e-05,
"loss": 0.9933,
"num_input_tokens_seen": 246579200,
"step": 30100
},
{
"epoch": 0.9673906079825741,
"grad_norm": 0.7009174227714539,
"learning_rate": 3.87156571726559e-05,
"loss": 0.964,
"num_input_tokens_seen": 247398400,
"step": 30200
},
{
"epoch": 0.9705938881414569,
"grad_norm": 0.7255650758743286,
"learning_rate": 3.8644707696141704e-05,
"loss": 0.9784,
"num_input_tokens_seen": 248217600,
"step": 30300
},
{
"epoch": 0.9737971683003396,
"grad_norm": 4.299106597900391,
"learning_rate": 3.857360135131691e-05,
"loss": 1.0191,
"num_input_tokens_seen": 249036800,
"step": 30400
},
{
"epoch": 0.9770004484592223,
"grad_norm": 0.5924736261367798,
"learning_rate": 3.8502338955665644e-05,
"loss": 0.9769,
"num_input_tokens_seen": 249856000,
"step": 30500
},
{
"epoch": 0.9802037286181049,
"grad_norm": 0.7270549535751343,
"learning_rate": 3.843092132846613e-05,
"loss": 1.0179,
"num_input_tokens_seen": 250675200,
"step": 30600
},
{
"epoch": 0.9834070087769876,
"grad_norm": 0.7704394459724426,
"learning_rate": 3.835934929078119e-05,
"loss": 0.9206,
"num_input_tokens_seen": 251494400,
"step": 30700
},
{
"epoch": 0.9866102889358703,
"grad_norm": 0.612688422203064,
"learning_rate": 3.828762366544888e-05,
"loss": 0.9686,
"num_input_tokens_seen": 252313600,
"step": 30800
},
{
"epoch": 0.989813569094753,
"grad_norm": 0.5262284278869629,
"learning_rate": 3.8215745277073e-05,
"loss": 0.9694,
"num_input_tokens_seen": 253132800,
"step": 30900
},
{
"epoch": 0.9930168492536358,
"grad_norm": 0.5798372626304626,
"learning_rate": 3.8143714952013584e-05,
"loss": 0.8879,
"num_input_tokens_seen": 253952000,
"step": 31000
},
{
"epoch": 0.9962201294125185,
"grad_norm": 0.5605859756469727,
"learning_rate": 3.807153351837746e-05,
"loss": 0.9948,
"num_input_tokens_seen": 254771200,
"step": 31100
},
{
"epoch": 0.9994234095714011,
"grad_norm": 1.9532912969589233,
"learning_rate": 3.799920180600868e-05,
"loss": 1.027,
"num_input_tokens_seen": 255590400,
"step": 31200
},
{
"epoch": 1.0026266897302838,
"grad_norm": 0.6683017611503601,
"learning_rate": 3.792672064647898e-05,
"loss": 0.9665,
"num_input_tokens_seen": 256409600,
"step": 31300
},
{
"epoch": 1.0058299698891664,
"grad_norm": 0.5574291348457336,
"learning_rate": 3.785409087307828e-05,
"loss": 0.8671,
"num_input_tokens_seen": 257228800,
"step": 31400
},
{
"epoch": 1.0090332500480492,
"grad_norm": 0.6487427949905396,
"learning_rate": 3.778131332080503e-05,
"loss": 0.9356,
"num_input_tokens_seen": 258048000,
"step": 31500
},
{
"epoch": 1.0122365302069318,
"grad_norm": 0.6974719166755676,
"learning_rate": 3.7708388826356636e-05,
"loss": 0.9751,
"num_input_tokens_seen": 258867200,
"step": 31600
},
{
"epoch": 1.0154398103658147,
"grad_norm": 0.6754201054573059,
"learning_rate": 3.763531822811986e-05,
"loss": 0.8963,
"num_input_tokens_seen": 259686400,
"step": 31700
},
{
"epoch": 1.0186430905246973,
"grad_norm": 0.5839199423789978,
"learning_rate": 3.756210236616117e-05,
"loss": 0.9021,
"num_input_tokens_seen": 260505600,
"step": 31800
},
{
"epoch": 1.02184637068358,
"grad_norm": 0.5535345673561096,
"learning_rate": 3.7488742082217064e-05,
"loss": 0.947,
"num_input_tokens_seen": 261324800,
"step": 31900
},
{
"epoch": 1.0250496508424627,
"grad_norm": 1.948480248451233,
"learning_rate": 3.741523821968441e-05,
"loss": 0.9314,
"num_input_tokens_seen": 262144000,
"step": 32000
},
{
"epoch": 1.0282529310013453,
"grad_norm": 0.8400202393531799,
"learning_rate": 3.734159162361077e-05,
"loss": 0.9523,
"num_input_tokens_seen": 262963200,
"step": 32100
},
{
"epoch": 1.0314562111602281,
"grad_norm": 0.7016623020172119,
"learning_rate": 3.7267803140684635e-05,
"loss": 0.9119,
"num_input_tokens_seen": 263782400,
"step": 32200
},
{
"epoch": 1.0346594913191107,
"grad_norm": 0.6084064841270447,
"learning_rate": 3.719387361922573e-05,
"loss": 0.9027,
"num_input_tokens_seen": 264601600,
"step": 32300
},
{
"epoch": 1.0378627714779936,
"grad_norm": 1.551859736442566,
"learning_rate": 3.711980390917523e-05,
"loss": 0.9126,
"num_input_tokens_seen": 265420800,
"step": 32400
},
{
"epoch": 1.0410660516368762,
"grad_norm": 0.6663823127746582,
"learning_rate": 3.7045594862086065e-05,
"loss": 0.909,
"num_input_tokens_seen": 266240000,
"step": 32500
},
{
"epoch": 1.0442693317957588,
"grad_norm": 0.6280916333198547,
"learning_rate": 3.697124733111299e-05,
"loss": 0.8809,
"num_input_tokens_seen": 267059200,
"step": 32600
},
{
"epoch": 1.0474726119546416,
"grad_norm": 0.7370727062225342,
"learning_rate": 3.689676217100293e-05,
"loss": 0.9155,
"num_input_tokens_seen": 267878400,
"step": 32700
},
{
"epoch": 1.0506758921135242,
"grad_norm": 0.5798324942588806,
"learning_rate": 3.682214023808506e-05,
"loss": 0.9514,
"num_input_tokens_seen": 268697600,
"step": 32800
},
{
"epoch": 1.053879172272407,
"grad_norm": 0.6621294021606445,
"learning_rate": 3.674738239026097e-05,
"loss": 0.9057,
"num_input_tokens_seen": 269516800,
"step": 32900
},
{
"epoch": 1.0570824524312896,
"grad_norm": 0.9696263074874878,
"learning_rate": 3.667248948699482e-05,
"loss": 0.9083,
"num_input_tokens_seen": 270336000,
"step": 33000
},
{
"epoch": 1.0602857325901724,
"grad_norm": 1.3327863216400146,
"learning_rate": 3.659746238930345e-05,
"loss": 0.9211,
"num_input_tokens_seen": 271155200,
"step": 33100
},
{
"epoch": 1.063489012749055,
"grad_norm": 0.7066917419433594,
"learning_rate": 3.6522301959746514e-05,
"loss": 0.9384,
"num_input_tokens_seen": 271974400,
"step": 33200
},
{
"epoch": 1.0666922929079377,
"grad_norm": 0.6944926977157593,
"learning_rate": 3.6447009062416506e-05,
"loss": 0.9296,
"num_input_tokens_seen": 272793600,
"step": 33300
},
{
"epoch": 1.0698955730668205,
"grad_norm": 2.94767165184021,
"learning_rate": 3.637158456292885e-05,
"loss": 0.8913,
"num_input_tokens_seen": 273612800,
"step": 33400
},
{
"epoch": 1.073098853225703,
"grad_norm": 0.671801745891571,
"learning_rate": 3.629602932841199e-05,
"loss": 0.9251,
"num_input_tokens_seen": 274432000,
"step": 33500
},
{
"epoch": 1.076302133384586,
"grad_norm": 0.6639389991760254,
"learning_rate": 3.622034422749734e-05,
"loss": 0.9024,
"num_input_tokens_seen": 275251200,
"step": 33600
},
{
"epoch": 1.0795054135434685,
"grad_norm": 0.6131206154823303,
"learning_rate": 3.614453013030936e-05,
"loss": 0.8965,
"num_input_tokens_seen": 276070400,
"step": 33700
},
{
"epoch": 1.0827086937023511,
"grad_norm": 2.824341058731079,
"learning_rate": 3.606858790845555e-05,
"loss": 0.9058,
"num_input_tokens_seen": 276889600,
"step": 33800
},
{
"epoch": 1.085911973861234,
"grad_norm": 0.4830228388309479,
"learning_rate": 3.5992518435016376e-05,
"loss": 0.9052,
"num_input_tokens_seen": 277708800,
"step": 33900
},
{
"epoch": 1.0891152540201166,
"grad_norm": 0.49670127034187317,
"learning_rate": 3.59163225845353e-05,
"loss": 0.9027,
"num_input_tokens_seen": 278528000,
"step": 34000
},
{
"epoch": 1.0923185341789994,
"grad_norm": 0.7440226674079895,
"learning_rate": 3.584000123300869e-05,
"loss": 0.8947,
"num_input_tokens_seen": 279347200,
"step": 34100
},
{
"epoch": 1.095521814337882,
"grad_norm": 0.515023410320282,
"learning_rate": 3.576355525787576e-05,
"loss": 0.8998,
"num_input_tokens_seen": 280166400,
"step": 34200
},
{
"epoch": 1.0987250944967646,
"grad_norm": 0.8011521100997925,
"learning_rate": 3.5686985538008445e-05,
"loss": 0.8951,
"num_input_tokens_seen": 280985600,
"step": 34300
},
{
"epoch": 1.1019283746556474,
"grad_norm": 0.5452113151550293,
"learning_rate": 3.561029295370138e-05,
"loss": 0.9009,
"num_input_tokens_seen": 281804800,
"step": 34400
},
{
"epoch": 1.10513165481453,
"grad_norm": 0.8674356937408447,
"learning_rate": 3.5533478386661665e-05,
"loss": 0.9592,
"num_input_tokens_seen": 282624000,
"step": 34500
},
{
"epoch": 1.1083349349734128,
"grad_norm": 0.653605043888092,
"learning_rate": 3.545654271999886e-05,
"loss": 0.8587,
"num_input_tokens_seen": 283443200,
"step": 34600
},
{
"epoch": 1.1115382151322954,
"grad_norm": 0.5951905846595764,
"learning_rate": 3.5379486838214715e-05,
"loss": 0.906,
"num_input_tokens_seen": 284262400,
"step": 34700
},
{
"epoch": 1.1147414952911783,
"grad_norm": 0.6143243908882141,
"learning_rate": 3.530231162719307e-05,
"loss": 0.8925,
"num_input_tokens_seen": 285081600,
"step": 34800
},
{
"epoch": 1.1179447754500609,
"grad_norm": 0.569734513759613,
"learning_rate": 3.5225017974189644e-05,
"loss": 0.8922,
"num_input_tokens_seen": 285900800,
"step": 34900
},
{
"epoch": 1.1211480556089435,
"grad_norm": 1.6546896696090698,
"learning_rate": 3.5147606767821846e-05,
"loss": 0.884,
"num_input_tokens_seen": 286720000,
"step": 35000
},
{
"epoch": 1.1243513357678263,
"grad_norm": 0.7131773829460144,
"learning_rate": 3.507007889805856e-05,
"loss": 0.8941,
"num_input_tokens_seen": 287539200,
"step": 35100
},
{
"epoch": 1.127554615926709,
"grad_norm": 1.8620835542678833,
"learning_rate": 3.499243525620988e-05,
"loss": 0.9209,
"num_input_tokens_seen": 288358400,
"step": 35200
},
{
"epoch": 1.1307578960855917,
"grad_norm": 1.936231017112732,
"learning_rate": 3.491467673491692e-05,
"loss": 0.9284,
"num_input_tokens_seen": 289177600,
"step": 35300
},
{
"epoch": 1.1339611762444743,
"grad_norm": 0.5847631096839905,
"learning_rate": 3.483680422814152e-05,
"loss": 0.9036,
"num_input_tokens_seen": 289996800,
"step": 35400
},
{
"epoch": 1.137164456403357,
"grad_norm": 0.6272117495536804,
"learning_rate": 3.4758818631155934e-05,
"loss": 0.8766,
"num_input_tokens_seen": 290816000,
"step": 35500
},
{
"epoch": 1.1403677365622398,
"grad_norm": 0.50895756483078,
"learning_rate": 3.4680720840532636e-05,
"loss": 0.8996,
"num_input_tokens_seen": 291635200,
"step": 35600
},
{
"epoch": 1.1435710167211224,
"grad_norm": 0.8421196341514587,
"learning_rate": 3.460251175413388e-05,
"loss": 0.932,
"num_input_tokens_seen": 292454400,
"step": 35700
},
{
"epoch": 1.1467742968800052,
"grad_norm": 1.1610244512557983,
"learning_rate": 3.452419227110151e-05,
"loss": 0.9095,
"num_input_tokens_seen": 293273600,
"step": 35800
},
{
"epoch": 1.1499775770388878,
"grad_norm": 0.5575504302978516,
"learning_rate": 3.444576329184651e-05,
"loss": 0.9166,
"num_input_tokens_seen": 294092800,
"step": 35900
},
{
"epoch": 1.1531808571977704,
"grad_norm": 0.5330684781074524,
"learning_rate": 3.436722571803874e-05,
"loss": 0.9445,
"num_input_tokens_seen": 294912000,
"step": 36000
},
{
"epoch": 1.1563841373566532,
"grad_norm": 0.7490949630737305,
"learning_rate": 3.428858045259652e-05,
"loss": 0.8947,
"num_input_tokens_seen": 295731200,
"step": 36100
},
{
"epoch": 1.1595874175155358,
"grad_norm": 1.870923399925232,
"learning_rate": 3.420982839967624e-05,
"loss": 0.9532,
"num_input_tokens_seen": 296550400,
"step": 36200
},
{
"epoch": 1.1627906976744187,
"grad_norm": 3.164524555206299,
"learning_rate": 3.413097046466203e-05,
"loss": 0.9716,
"num_input_tokens_seen": 297369600,
"step": 36300
},
{
"epoch": 1.1659939778333013,
"grad_norm": 1.375303864479065,
"learning_rate": 3.405200755415527e-05,
"loss": 0.9364,
"num_input_tokens_seen": 298188800,
"step": 36400
},
{
"epoch": 1.169197257992184,
"grad_norm": 2.2876625061035156,
"learning_rate": 3.397294057596424e-05,
"loss": 0.8933,
"num_input_tokens_seen": 299008000,
"step": 36500
},
{
"epoch": 1.1724005381510667,
"grad_norm": 0.5776546597480774,
"learning_rate": 3.389377043909361e-05,
"loss": 0.8916,
"num_input_tokens_seen": 299827200,
"step": 36600
},
{
"epoch": 1.1756038183099493,
"grad_norm": 0.7254892587661743,
"learning_rate": 3.381449805373406e-05,
"loss": 0.922,
"num_input_tokens_seen": 300646400,
"step": 36700
},
{
"epoch": 1.1788070984688321,
"grad_norm": 0.7244319319725037,
"learning_rate": 3.3735124331251764e-05,
"loss": 0.9093,
"num_input_tokens_seen": 301465600,
"step": 36800
},
{
"epoch": 1.1820103786277147,
"grad_norm": 0.5166808366775513,
"learning_rate": 3.3655650184177957e-05,
"loss": 0.9553,
"num_input_tokens_seen": 302284800,
"step": 36900
},
{
"epoch": 1.1852136587865976,
"grad_norm": 1.6987115144729614,
"learning_rate": 3.357607652619839e-05,
"loss": 0.8768,
"num_input_tokens_seen": 303104000,
"step": 37000
},
{
"epoch": 1.1884169389454802,
"grad_norm": 0.8271929621696472,
"learning_rate": 3.349640427214287e-05,
"loss": 0.9632,
"num_input_tokens_seen": 303923200,
"step": 37100
},
{
"epoch": 1.1916202191043628,
"grad_norm": 0.7163927555084229,
"learning_rate": 3.341663433797474e-05,
"loss": 0.8682,
"num_input_tokens_seen": 304742400,
"step": 37200
},
{
"epoch": 1.1948234992632456,
"grad_norm": 0.6233458518981934,
"learning_rate": 3.33367676407803e-05,
"loss": 0.9334,
"num_input_tokens_seen": 305561600,
"step": 37300
},
{
"epoch": 1.1980267794221282,
"grad_norm": 1.0882517099380493,
"learning_rate": 3.3256805098758346e-05,
"loss": 0.9073,
"num_input_tokens_seen": 306380800,
"step": 37400
},
{
"epoch": 1.201230059581011,
"grad_norm": 0.8322218656539917,
"learning_rate": 3.3176747631209534e-05,
"loss": 0.9343,
"num_input_tokens_seen": 307200000,
"step": 37500
},
{
"epoch": 1.2044333397398936,
"grad_norm": 1.4540088176727295,
"learning_rate": 3.309659615852586e-05,
"loss": 0.8541,
"num_input_tokens_seen": 308019200,
"step": 37600
},
{
"epoch": 1.2076366198987762,
"grad_norm": 0.6830178499221802,
"learning_rate": 3.301635160218005e-05,
"loss": 0.8889,
"num_input_tokens_seen": 308838400,
"step": 37700
},
{
"epoch": 1.210839900057659,
"grad_norm": 1.9847421646118164,
"learning_rate": 3.293601488471499e-05,
"loss": 0.883,
"num_input_tokens_seen": 309657600,
"step": 37800
},
{
"epoch": 1.2140431802165417,
"grad_norm": 0.8129870891571045,
"learning_rate": 3.285558692973312e-05,
"loss": 0.9474,
"num_input_tokens_seen": 310476800,
"step": 37900
},
{
"epoch": 1.2172464603754245,
"grad_norm": 0.6733205914497375,
"learning_rate": 3.277506866188577e-05,
"loss": 0.904,
"num_input_tokens_seen": 311296000,
"step": 38000
},
{
"epoch": 1.220449740534307,
"grad_norm": 1.2211860418319702,
"learning_rate": 3.269446100686261e-05,
"loss": 0.8879,
"num_input_tokens_seen": 312115200,
"step": 38100
},
{
"epoch": 1.22365302069319,
"grad_norm": 0.7225973010063171,
"learning_rate": 3.261376489138092e-05,
"loss": 0.9139,
"num_input_tokens_seen": 312934400,
"step": 38200
},
{
"epoch": 1.2268563008520725,
"grad_norm": 0.7631468772888184,
"learning_rate": 3.253298124317502e-05,
"loss": 0.959,
"num_input_tokens_seen": 313753600,
"step": 38300
},
{
"epoch": 1.2300595810109551,
"grad_norm": 0.6244317889213562,
"learning_rate": 3.245211099098551e-05,
"loss": 0.9155,
"num_input_tokens_seen": 314572800,
"step": 38400
},
{
"epoch": 1.233262861169838,
"grad_norm": 0.5164452791213989,
"learning_rate": 3.237115506454869e-05,
"loss": 0.8758,
"num_input_tokens_seen": 315392000,
"step": 38500
},
{
"epoch": 1.2364661413287206,
"grad_norm": 0.7463127970695496,
"learning_rate": 3.2290114394585815e-05,
"loss": 0.9116,
"num_input_tokens_seen": 316211200,
"step": 38600
},
{
"epoch": 1.2396694214876034,
"grad_norm": 0.697425901889801,
"learning_rate": 3.22089899127924e-05,
"loss": 0.8743,
"num_input_tokens_seen": 317030400,
"step": 38700
},
{
"epoch": 1.242872701646486,
"grad_norm": 0.6725397706031799,
"learning_rate": 3.212778255182752e-05,
"loss": 0.9507,
"num_input_tokens_seen": 317849600,
"step": 38800
},
{
"epoch": 1.2460759818053686,
"grad_norm": 0.5633911490440369,
"learning_rate": 3.2046493245303066e-05,
"loss": 0.9114,
"num_input_tokens_seen": 318668800,
"step": 38900
},
{
"epoch": 1.2492792619642514,
"grad_norm": 0.4953620135784149,
"learning_rate": 3.196512292777305e-05,
"loss": 0.9392,
"num_input_tokens_seen": 319488000,
"step": 39000
},
{
"epoch": 1.252482542123134,
"grad_norm": 0.5511077642440796,
"learning_rate": 3.1883672534722824e-05,
"loss": 0.9277,
"num_input_tokens_seen": 320307200,
"step": 39100
},
{
"epoch": 1.2556858222820169,
"grad_norm": 1.671002745628357,
"learning_rate": 3.180214300255834e-05,
"loss": 0.8868,
"num_input_tokens_seen": 321126400,
"step": 39200
},
{
"epoch": 1.2588891024408995,
"grad_norm": 0.47333982586860657,
"learning_rate": 3.1720535268595406e-05,
"loss": 0.9129,
"num_input_tokens_seen": 321945600,
"step": 39300
},
{
"epoch": 1.262092382599782,
"grad_norm": 0.6256750226020813,
"learning_rate": 3.1638850271048845e-05,
"loss": 0.9237,
"num_input_tokens_seen": 322764800,
"step": 39400
},
{
"epoch": 1.265295662758665,
"grad_norm": 1.6359134912490845,
"learning_rate": 3.15570889490218e-05,
"loss": 0.8913,
"num_input_tokens_seen": 323584000,
"step": 39500
},
{
"epoch": 1.2684989429175475,
"grad_norm": 0.7079516649246216,
"learning_rate": 3.1475252242494855e-05,
"loss": 0.9312,
"num_input_tokens_seen": 324403200,
"step": 39600
},
{
"epoch": 1.2717022230764303,
"grad_norm": 0.5469818711280823,
"learning_rate": 3.139334109231527e-05,
"loss": 0.8776,
"num_input_tokens_seen": 325222400,
"step": 39700
},
{
"epoch": 1.274905503235313,
"grad_norm": 0.6753129959106445,
"learning_rate": 3.131135644018617e-05,
"loss": 0.9715,
"num_input_tokens_seen": 326041600,
"step": 39800
},
{
"epoch": 1.2781087833941958,
"grad_norm": 1.3139586448669434,
"learning_rate": 3.1229299228655683e-05,
"loss": 0.9268,
"num_input_tokens_seen": 326860800,
"step": 39900
},
{
"epoch": 1.2813120635530784,
"grad_norm": 0.6371886730194092,
"learning_rate": 3.1147170401106154e-05,
"loss": 0.9286,
"num_input_tokens_seen": 327680000,
"step": 40000
},
{
"epoch": 1.284515343711961,
"grad_norm": 0.9212737083435059,
"learning_rate": 3.106497090174325e-05,
"loss": 0.9317,
"num_input_tokens_seen": 328499200,
"step": 40100
},
{
"epoch": 1.2877186238708438,
"grad_norm": 0.6135571002960205,
"learning_rate": 3.098270167558514e-05,
"loss": 0.9152,
"num_input_tokens_seen": 329318400,
"step": 40200
},
{
"epoch": 1.2909219040297264,
"grad_norm": 0.6993789076805115,
"learning_rate": 3.09003636684516e-05,
"loss": 0.9283,
"num_input_tokens_seen": 330137600,
"step": 40300
},
{
"epoch": 1.294125184188609,
"grad_norm": 0.7431827783584595,
"learning_rate": 3.081795782695317e-05,
"loss": 0.9307,
"num_input_tokens_seen": 330956800,
"step": 40400
},
{
"epoch": 1.2973284643474918,
"grad_norm": 0.9774760603904724,
"learning_rate": 3.0735485098480255e-05,
"loss": 0.8917,
"num_input_tokens_seen": 331776000,
"step": 40500
},
{
"epoch": 1.3005317445063747,
"grad_norm": 0.5644115209579468,
"learning_rate": 3.0652946431192244e-05,
"loss": 0.9321,
"num_input_tokens_seen": 332595200,
"step": 40600
},
{
"epoch": 1.3037350246652573,
"grad_norm": 2.2749266624450684,
"learning_rate": 3.057034277400658e-05,
"loss": 0.9211,
"num_input_tokens_seen": 333414400,
"step": 40700
},
{
"epoch": 1.3069383048241399,
"grad_norm": 0.6312987804412842,
"learning_rate": 3.048767507658788e-05,
"loss": 0.913,
"num_input_tokens_seen": 334233600,
"step": 40800
},
{
"epoch": 1.3101415849830227,
"grad_norm": 0.5494056344032288,
"learning_rate": 3.0404944289337034e-05,
"loss": 0.9423,
"num_input_tokens_seen": 335052800,
"step": 40900
},
{
"epoch": 1.3133448651419053,
"grad_norm": 1.3932960033416748,
"learning_rate": 3.0322151363380202e-05,
"loss": 0.9409,
"num_input_tokens_seen": 335872000,
"step": 41000
},
{
"epoch": 1.316548145300788,
"grad_norm": 0.7711178660392761,
"learning_rate": 3.023929725055798e-05,
"loss": 0.9187,
"num_input_tokens_seen": 336691200,
"step": 41100
},
{
"epoch": 1.3197514254596707,
"grad_norm": 0.9086521863937378,
"learning_rate": 3.0156382903414383e-05,
"loss": 1.0063,
"num_input_tokens_seen": 337510400,
"step": 41200
},
{
"epoch": 1.3229547056185533,
"grad_norm": 0.6938414573669434,
"learning_rate": 3.007340927518591e-05,
"loss": 0.8821,
"num_input_tokens_seen": 338329600,
"step": 41300
},
{
"epoch": 1.3261579857774362,
"grad_norm": 0.5269713401794434,
"learning_rate": 2.999037731979063e-05,
"loss": 0.8968,
"num_input_tokens_seen": 339148800,
"step": 41400
},
{
"epoch": 1.3293612659363188,
"grad_norm": 0.69822096824646,
"learning_rate": 2.9907287991817128e-05,
"loss": 0.955,
"num_input_tokens_seen": 339968000,
"step": 41500
},
{
"epoch": 1.3325645460952016,
"grad_norm": 1.9268356561660767,
"learning_rate": 2.9824142246513624e-05,
"loss": 0.9096,
"num_input_tokens_seen": 340787200,
"step": 41600
},
{
"epoch": 1.3357678262540842,
"grad_norm": 0.5475559234619141,
"learning_rate": 2.9740941039776925e-05,
"loss": 0.8828,
"num_input_tokens_seen": 341606400,
"step": 41700
},
{
"epoch": 1.3389711064129668,
"grad_norm": 1.9515366554260254,
"learning_rate": 2.9657685328141466e-05,
"loss": 0.9614,
"num_input_tokens_seen": 342425600,
"step": 41800
},
{
"epoch": 1.3421743865718496,
"grad_norm": 0.6959076523780823,
"learning_rate": 2.95743760687683e-05,
"loss": 0.8739,
"num_input_tokens_seen": 343244800,
"step": 41900
},
{
"epoch": 1.3453776667307322,
"grad_norm": 0.761962890625,
"learning_rate": 2.9491014219434105e-05,
"loss": 0.9595,
"num_input_tokens_seen": 344064000,
"step": 42000
},
{
"epoch": 1.3485809468896148,
"grad_norm": 0.6127232909202576,
"learning_rate": 2.9407600738520162e-05,
"loss": 0.9026,
"num_input_tokens_seen": 344883200,
"step": 42100
},
{
"epoch": 1.3517842270484977,
"grad_norm": 0.6869720220565796,
"learning_rate": 2.9324136585001348e-05,
"loss": 0.9488,
"num_input_tokens_seen": 345702400,
"step": 42200
},
{
"epoch": 1.3549875072073805,
"grad_norm": 0.7109299898147583,
"learning_rate": 2.9240622718435107e-05,
"loss": 0.9433,
"num_input_tokens_seen": 346521600,
"step": 42300
},
{
"epoch": 1.358190787366263,
"grad_norm": 0.6879071593284607,
"learning_rate": 2.9157060098950395e-05,
"loss": 0.8783,
"num_input_tokens_seen": 347340800,
"step": 42400
},
{
"epoch": 1.3613940675251457,
"grad_norm": 0.5623328685760498,
"learning_rate": 2.9073449687236688e-05,
"loss": 0.8925,
"num_input_tokens_seen": 348160000,
"step": 42500
},
{
"epoch": 1.3645973476840285,
"grad_norm": 0.9881012439727783,
"learning_rate": 2.8989792444532892e-05,
"loss": 0.9417,
"num_input_tokens_seen": 348979200,
"step": 42600
},
{
"epoch": 1.3678006278429111,
"grad_norm": 0.6569281816482544,
"learning_rate": 2.890608933261633e-05,
"loss": 0.9262,
"num_input_tokens_seen": 349798400,
"step": 42700
},
{
"epoch": 1.3710039080017937,
"grad_norm": 0.9453611969947815,
"learning_rate": 2.882234131379167e-05,
"loss": 0.9022,
"num_input_tokens_seen": 350617600,
"step": 42800
},
{
"epoch": 1.3742071881606766,
"grad_norm": 0.5668920874595642,
"learning_rate": 2.8738549350879824e-05,
"loss": 0.9306,
"num_input_tokens_seen": 351436800,
"step": 42900
},
{
"epoch": 1.3774104683195592,
"grad_norm": 0.8056479692459106,
"learning_rate": 2.8654714407206956e-05,
"loss": 0.8878,
"num_input_tokens_seen": 352256000,
"step": 43000
},
{
"epoch": 1.380613748478442,
"grad_norm": 0.863929271697998,
"learning_rate": 2.8570837446593336e-05,
"loss": 0.9391,
"num_input_tokens_seen": 353075200,
"step": 43100
},
{
"epoch": 1.3838170286373246,
"grad_norm": 0.5808566808700562,
"learning_rate": 2.8486919433342295e-05,
"loss": 0.9061,
"num_input_tokens_seen": 353894400,
"step": 43200
},
{
"epoch": 1.3870203087962074,
"grad_norm": 0.8920639157295227,
"learning_rate": 2.8402961332229143e-05,
"loss": 0.8854,
"num_input_tokens_seen": 354713600,
"step": 43300
},
{
"epoch": 1.39022358895509,
"grad_norm": 0.6987112760543823,
"learning_rate": 2.831896410849005e-05,
"loss": 0.893,
"num_input_tokens_seen": 355532800,
"step": 43400
},
{
"epoch": 1.3934268691139726,
"grad_norm": 0.6486085653305054,
"learning_rate": 2.823492872781098e-05,
"loss": 0.9166,
"num_input_tokens_seen": 356352000,
"step": 43500
},
{
"epoch": 1.3966301492728554,
"grad_norm": 1.6597498655319214,
"learning_rate": 2.815085615631654e-05,
"loss": 0.9473,
"num_input_tokens_seen": 357171200,
"step": 43600
},
{
"epoch": 1.399833429431738,
"grad_norm": 0.598414957523346,
"learning_rate": 2.8066747360558966e-05,
"loss": 0.9046,
"num_input_tokens_seen": 357990400,
"step": 43700
},
{
"epoch": 1.4030367095906209,
"grad_norm": 2.125504732131958,
"learning_rate": 2.798260330750689e-05,
"loss": 0.9325,
"num_input_tokens_seen": 358809600,
"step": 43800
},
{
"epoch": 1.4062399897495035,
"grad_norm": 0.798989474773407,
"learning_rate": 2.789842496453432e-05,
"loss": 0.9057,
"num_input_tokens_seen": 359628800,
"step": 43900
},
{
"epoch": 1.4094432699083863,
"grad_norm": 0.8189502954483032,
"learning_rate": 2.7814213299409475e-05,
"loss": 0.923,
"num_input_tokens_seen": 360448000,
"step": 44000
},
{
"epoch": 1.412646550067269,
"grad_norm": 0.5460119247436523,
"learning_rate": 2.7729969280283662e-05,
"loss": 0.8764,
"num_input_tokens_seen": 361267200,
"step": 44100
},
{
"epoch": 1.4158498302261515,
"grad_norm": 0.6900705695152283,
"learning_rate": 2.7645693875680163e-05,
"loss": 0.9295,
"num_input_tokens_seen": 362086400,
"step": 44200
},
{
"epoch": 1.4190531103850343,
"grad_norm": 0.7309842705726624,
"learning_rate": 2.7561388054483074e-05,
"loss": 0.8883,
"num_input_tokens_seen": 362905600,
"step": 44300
},
{
"epoch": 1.422256390543917,
"grad_norm": 0.9340581297874451,
"learning_rate": 2.7477052785926178e-05,
"loss": 0.8784,
"num_input_tokens_seen": 363724800,
"step": 44400
},
{
"epoch": 1.4254596707027996,
"grad_norm": 0.6001551151275635,
"learning_rate": 2.7392689039581815e-05,
"loss": 0.949,
"num_input_tokens_seen": 364544000,
"step": 44500
},
{
"epoch": 1.4286629508616824,
"grad_norm": 0.5180249810218811,
"learning_rate": 2.7308297785349724e-05,
"loss": 0.8738,
"num_input_tokens_seen": 365363200,
"step": 44600
},
{
"epoch": 1.431866231020565,
"grad_norm": 0.6243082284927368,
"learning_rate": 2.7223879993445873e-05,
"loss": 0.9074,
"num_input_tokens_seen": 366182400,
"step": 44700
},
{
"epoch": 1.4350695111794478,
"grad_norm": 0.6807756423950195,
"learning_rate": 2.713943663439135e-05,
"loss": 0.953,
"num_input_tokens_seen": 367001600,
"step": 44800
},
{
"epoch": 1.4382727913383304,
"grad_norm": 0.6057282090187073,
"learning_rate": 2.7054968679001174e-05,
"loss": 0.8736,
"num_input_tokens_seen": 367820800,
"step": 44900
},
{
"epoch": 1.4414760714972132,
"grad_norm": 0.593506395816803,
"learning_rate": 2.697047709837312e-05,
"loss": 0.8405,
"num_input_tokens_seen": 368640000,
"step": 45000
},
{
"epoch": 1.4446793516560958,
"grad_norm": 0.7090416550636292,
"learning_rate": 2.6885962863876596e-05,
"loss": 0.8852,
"num_input_tokens_seen": 369459200,
"step": 45100
},
{
"epoch": 1.4478826318149784,
"grad_norm": 0.5391395092010498,
"learning_rate": 2.6801426947141435e-05,
"loss": 0.9029,
"num_input_tokens_seen": 370278400,
"step": 45200
},
{
"epoch": 1.4510859119738613,
"grad_norm": 0.5424131751060486,
"learning_rate": 2.671687032004676e-05,
"loss": 0.8751,
"num_input_tokens_seen": 371097600,
"step": 45300
},
{
"epoch": 1.4542891921327439,
"grad_norm": 0.5781705975532532,
"learning_rate": 2.6632293954709785e-05,
"loss": 0.9417,
"num_input_tokens_seen": 371916800,
"step": 45400
},
{
"epoch": 1.4574924722916267,
"grad_norm": 0.5788801312446594,
"learning_rate": 2.654769882347464e-05,
"loss": 0.9022,
"num_input_tokens_seen": 372736000,
"step": 45500
},
{
"epoch": 1.4606957524505093,
"grad_norm": 0.6637430787086487,
"learning_rate": 2.646308589890123e-05,
"loss": 0.9017,
"num_input_tokens_seen": 373555200,
"step": 45600
},
{
"epoch": 1.4638990326093921,
"grad_norm": 0.7034772634506226,
"learning_rate": 2.637845615375397e-05,
"loss": 0.883,
"num_input_tokens_seen": 374374400,
"step": 45700
},
{
"epoch": 1.4671023127682747,
"grad_norm": 0.6476500630378723,
"learning_rate": 2.629381056099071e-05,
"loss": 0.9469,
"num_input_tokens_seen": 375193600,
"step": 45800
},
{
"epoch": 1.4703055929271573,
"grad_norm": 0.560495913028717,
"learning_rate": 2.6209150093751473e-05,
"loss": 0.885,
"num_input_tokens_seen": 376012800,
"step": 45900
},
{
"epoch": 1.4735088730860402,
"grad_norm": 1.9203239679336548,
"learning_rate": 2.612447572534727e-05,
"loss": 0.9248,
"num_input_tokens_seen": 376832000,
"step": 46000
},
{
"epoch": 1.4767121532449228,
"grad_norm": 2.3468987941741943,
"learning_rate": 2.6039788429248957e-05,
"loss": 0.9041,
"num_input_tokens_seen": 377651200,
"step": 46100
},
{
"epoch": 1.4799154334038054,
"grad_norm": 0.6502100825309753,
"learning_rate": 2.5955089179075997e-05,
"loss": 0.9431,
"num_input_tokens_seen": 378470400,
"step": 46200
},
{
"epoch": 1.4831187135626882,
"grad_norm": 3.609816551208496,
"learning_rate": 2.5870378948585295e-05,
"loss": 0.8893,
"num_input_tokens_seen": 379289600,
"step": 46300
},
{
"epoch": 1.4863219937215708,
"grad_norm": 0.58833247423172,
"learning_rate": 2.5785658711659987e-05,
"loss": 0.9181,
"num_input_tokens_seen": 380108800,
"step": 46400
},
{
"epoch": 1.4895252738804536,
"grad_norm": 1.7303794622421265,
"learning_rate": 2.570092944229826e-05,
"loss": 0.8921,
"num_input_tokens_seen": 380928000,
"step": 46500
},
{
"epoch": 1.4927285540393362,
"grad_norm": 0.7278485894203186,
"learning_rate": 2.5616192114602127e-05,
"loss": 0.8693,
"num_input_tokens_seen": 381747200,
"step": 46600
},
{
"epoch": 1.495931834198219,
"grad_norm": 0.7616570591926575,
"learning_rate": 2.5531447702766254e-05,
"loss": 0.9397,
"num_input_tokens_seen": 382566400,
"step": 46700
},
{
"epoch": 1.4991351143571017,
"grad_norm": 0.11684958636760712,
"learning_rate": 2.5446697181066747e-05,
"loss": 0.8526,
"num_input_tokens_seen": 383385600,
"step": 46800
},
{
"epoch": 1.5023383945159843,
"grad_norm": 0.7726488709449768,
"learning_rate": 2.536194152384997e-05,
"loss": 0.9122,
"num_input_tokens_seen": 384204800,
"step": 46900
},
{
"epoch": 1.505541674674867,
"grad_norm": 0.7091355323791504,
"learning_rate": 2.527718170552129e-05,
"loss": 0.8666,
"num_input_tokens_seen": 385024000,
"step": 47000
},
{
"epoch": 1.5087449548337497,
"grad_norm": 2.5142340660095215,
"learning_rate": 2.519241870053396e-05,
"loss": 0.911,
"num_input_tokens_seen": 385843200,
"step": 47100
},
{
"epoch": 1.5119482349926323,
"grad_norm": 0.6862989664077759,
"learning_rate": 2.5107653483377852e-05,
"loss": 0.974,
"num_input_tokens_seen": 386662400,
"step": 47200
},
{
"epoch": 1.5151515151515151,
"grad_norm": 2.351198196411133,
"learning_rate": 2.502288702856824e-05,
"loss": 0.8986,
"num_input_tokens_seen": 387481600,
"step": 47300
},
{
"epoch": 1.518354795310398,
"grad_norm": 0.7517640590667725,
"learning_rate": 2.4938120310634682e-05,
"loss": 0.8549,
"num_input_tokens_seen": 388300800,
"step": 47400
},
{
"epoch": 1.5215580754692806,
"grad_norm": 2.709975004196167,
"learning_rate": 2.485335430410972e-05,
"loss": 0.899,
"num_input_tokens_seen": 389120000,
"step": 47500
},
{
"epoch": 1.5247613556281632,
"grad_norm": 0.7952636480331421,
"learning_rate": 2.4768589983517716e-05,
"loss": 0.8622,
"num_input_tokens_seen": 389939200,
"step": 47600
},
{
"epoch": 1.527964635787046,
"grad_norm": 0.7378533482551575,
"learning_rate": 2.4683828323363687e-05,
"loss": 0.8334,
"num_input_tokens_seen": 390758400,
"step": 47700
},
{
"epoch": 1.5311679159459286,
"grad_norm": 2.5980470180511475,
"learning_rate": 2.459907029812203e-05,
"loss": 0.9028,
"num_input_tokens_seen": 391577600,
"step": 47800
},
{
"epoch": 1.5343711961048112,
"grad_norm": 0.6807860732078552,
"learning_rate": 2.4514316882225347e-05,
"loss": 0.9259,
"num_input_tokens_seen": 392396800,
"step": 47900
},
{
"epoch": 1.537574476263694,
"grad_norm": 2.3691670894622803,
"learning_rate": 2.442956905005328e-05,
"loss": 0.8639,
"num_input_tokens_seen": 393216000,
"step": 48000
},
{
"epoch": 1.5407777564225769,
"grad_norm": 0.7466169595718384,
"learning_rate": 2.434482777592125e-05,
"loss": 0.8828,
"num_input_tokens_seen": 394035200,
"step": 48100
},
{
"epoch": 1.5439810365814595,
"grad_norm": 0.5329868793487549,
"learning_rate": 2.426009403406931e-05,
"loss": 0.8802,
"num_input_tokens_seen": 394854400,
"step": 48200
},
{
"epoch": 1.547184316740342,
"grad_norm": 0.6394245028495789,
"learning_rate": 2.4175368798650884e-05,
"loss": 0.8811,
"num_input_tokens_seen": 395673600,
"step": 48300
},
{
"epoch": 1.550387596899225,
"grad_norm": 0.9404513239860535,
"learning_rate": 2.4090653043721612e-05,
"loss": 0.8663,
"num_input_tokens_seen": 396492800,
"step": 48400
},
{
"epoch": 1.5535908770581075,
"grad_norm": 0.7973567843437195,
"learning_rate": 2.4005947743228157e-05,
"loss": 0.9452,
"num_input_tokens_seen": 397312000,
"step": 48500
},
{
"epoch": 1.55679415721699,
"grad_norm": 1.8970893621444702,
"learning_rate": 2.3921253870996972e-05,
"loss": 0.8968,
"num_input_tokens_seen": 398131200,
"step": 48600
},
{
"epoch": 1.559997437375873,
"grad_norm": 0.7782315015792847,
"learning_rate": 2.383657240072314e-05,
"loss": 0.9475,
"num_input_tokens_seen": 398950400,
"step": 48700
},
{
"epoch": 1.5632007175347555,
"grad_norm": 0.72723788022995,
"learning_rate": 2.375190430595914e-05,
"loss": 0.9347,
"num_input_tokens_seen": 399769600,
"step": 48800
},
{
"epoch": 1.5664039976936381,
"grad_norm": 0.5238316655158997,
"learning_rate": 2.366725056010369e-05,
"loss": 0.8969,
"num_input_tokens_seen": 400588800,
"step": 48900
},
{
"epoch": 1.569607277852521,
"grad_norm": 0.7676683664321899,
"learning_rate": 2.3582612136390556e-05,
"loss": 0.8926,
"num_input_tokens_seen": 401408000,
"step": 49000
},
{
"epoch": 1.5728105580114038,
"grad_norm": 1.64457106590271,
"learning_rate": 2.349799000787733e-05,
"loss": 0.9027,
"num_input_tokens_seen": 402227200,
"step": 49100
},
{
"epoch": 1.5760138381702864,
"grad_norm": 0.5461480617523193,
"learning_rate": 2.3413385147434285e-05,
"loss": 0.8651,
"num_input_tokens_seen": 403046400,
"step": 49200
},
{
"epoch": 1.579217118329169,
"grad_norm": 0.527300238609314,
"learning_rate": 2.332879852773314e-05,
"loss": 0.8354,
"num_input_tokens_seen": 403865600,
"step": 49300
},
{
"epoch": 1.5824203984880518,
"grad_norm": 0.8455817699432373,
"learning_rate": 2.3244231121235936e-05,
"loss": 0.903,
"num_input_tokens_seen": 404684800,
"step": 49400
},
{
"epoch": 1.5856236786469344,
"grad_norm": 0.8457258939743042,
"learning_rate": 2.3159683900183812e-05,
"loss": 0.9085,
"num_input_tokens_seen": 405504000,
"step": 49500
},
{
"epoch": 1.588826958805817,
"grad_norm": 0.7063552141189575,
"learning_rate": 2.3075157836585854e-05,
"loss": 0.9002,
"num_input_tokens_seen": 406323200,
"step": 49600
},
{
"epoch": 1.5920302389646999,
"grad_norm": 0.6034948229789734,
"learning_rate": 2.2990653902207875e-05,
"loss": 0.8665,
"num_input_tokens_seen": 407142400,
"step": 49700
},
{
"epoch": 1.5952335191235827,
"grad_norm": 0.6883265972137451,
"learning_rate": 2.2906173068561324e-05,
"loss": 0.9031,
"num_input_tokens_seen": 407961600,
"step": 49800
},
{
"epoch": 1.5984367992824653,
"grad_norm": 0.6610883474349976,
"learning_rate": 2.282171630689203e-05,
"loss": 0.9153,
"num_input_tokens_seen": 408780800,
"step": 49900
},
{
"epoch": 1.601640079441348,
"grad_norm": 1.8148962259292603,
"learning_rate": 2.2737284588169107e-05,
"loss": 0.8904,
"num_input_tokens_seen": 409600000,
"step": 50000
},
{
"epoch": 1.6048433596002307,
"grad_norm": 0.8317341804504395,
"learning_rate": 2.2652878883073736e-05,
"loss": 0.8847,
"num_input_tokens_seen": 410419200,
"step": 50100
},
{
"epoch": 1.6080466397591133,
"grad_norm": 0.5359209179878235,
"learning_rate": 2.2568500161988023e-05,
"loss": 0.8983,
"num_input_tokens_seen": 411238400,
"step": 50200
},
{
"epoch": 1.611249919917996,
"grad_norm": 0.6819952726364136,
"learning_rate": 2.2484149394983882e-05,
"loss": 0.9138,
"num_input_tokens_seen": 412057600,
"step": 50300
},
{
"epoch": 1.6144532000768788,
"grad_norm": 0.8475795984268188,
"learning_rate": 2.239982755181181e-05,
"loss": 0.8536,
"num_input_tokens_seen": 412876800,
"step": 50400
},
{
"epoch": 1.6176564802357616,
"grad_norm": 1.1045705080032349,
"learning_rate": 2.2315535601889814e-05,
"loss": 0.9137,
"num_input_tokens_seen": 413696000,
"step": 50500
},
{
"epoch": 1.620859760394644,
"grad_norm": 0.6131917834281921,
"learning_rate": 2.2231274514292196e-05,
"loss": 0.8992,
"num_input_tokens_seen": 414515200,
"step": 50600
},
{
"epoch": 1.6240630405535268,
"grad_norm": 0.6096556186676025,
"learning_rate": 2.214704525773846e-05,
"loss": 0.9211,
"num_input_tokens_seen": 415334400,
"step": 50700
},
{
"epoch": 1.6272663207124096,
"grad_norm": 0.5279362797737122,
"learning_rate": 2.2062848800582168e-05,
"loss": 0.9231,
"num_input_tokens_seen": 416153600,
"step": 50800
},
{
"epoch": 1.6304696008712922,
"grad_norm": 0.5645897388458252,
"learning_rate": 2.197868611079978e-05,
"loss": 0.8579,
"num_input_tokens_seen": 416972800,
"step": 50900
},
{
"epoch": 1.6336728810301748,
"grad_norm": 0.5469439029693604,
"learning_rate": 2.189455815597957e-05,
"loss": 0.8802,
"num_input_tokens_seen": 417792000,
"step": 51000
},
{
"epoch": 1.6368761611890577,
"grad_norm": 0.7165865898132324,
"learning_rate": 2.1810465903310445e-05,
"loss": 0.897,
"num_input_tokens_seen": 418611200,
"step": 51100
},
{
"epoch": 1.6400794413479403,
"grad_norm": 0.49263107776641846,
"learning_rate": 2.1726410319570874e-05,
"loss": 0.9145,
"num_input_tokens_seen": 419430400,
"step": 51200
},
{
"epoch": 1.6432827215068229,
"grad_norm": 0.7984305620193481,
"learning_rate": 2.164239237111776e-05,
"loss": 0.9656,
"num_input_tokens_seen": 420249600,
"step": 51300
},
{
"epoch": 1.6464860016657057,
"grad_norm": 0.6783995628356934,
"learning_rate": 2.1558413023875334e-05,
"loss": 0.8937,
"num_input_tokens_seen": 421068800,
"step": 51400
},
{
"epoch": 1.6496892818245885,
"grad_norm": 0.6700116395950317,
"learning_rate": 2.147447324332403e-05,
"loss": 0.8966,
"num_input_tokens_seen": 421888000,
"step": 51500
},
{
"epoch": 1.6528925619834711,
"grad_norm": 2.6840033531188965,
"learning_rate": 2.1390573994489377e-05,
"loss": 0.9922,
"num_input_tokens_seen": 422707200,
"step": 51600
},
{
"epoch": 1.6560958421423537,
"grad_norm": 0.6062913537025452,
"learning_rate": 2.1306716241930968e-05,
"loss": 0.9201,
"num_input_tokens_seen": 423526400,
"step": 51700
},
{
"epoch": 1.6592991223012366,
"grad_norm": 0.7637689113616943,
"learning_rate": 2.1222900949731297e-05,
"loss": 0.9039,
"num_input_tokens_seen": 424345600,
"step": 51800
},
{
"epoch": 1.6625024024601192,
"grad_norm": 3.154482841491699,
"learning_rate": 2.1139129081484734e-05,
"loss": 0.968,
"num_input_tokens_seen": 425164800,
"step": 51900
},
{
"epoch": 1.6657056826190018,
"grad_norm": 1.900366187095642,
"learning_rate": 2.1055401600286386e-05,
"loss": 0.9064,
"num_input_tokens_seen": 425984000,
"step": 52000
},
{
"epoch": 1.6689089627778846,
"grad_norm": 0.6276770830154419,
"learning_rate": 2.0971719468721077e-05,
"loss": 0.8786,
"num_input_tokens_seen": 426803200,
"step": 52100
},
{
"epoch": 1.6721122429367674,
"grad_norm": 0.7337915301322937,
"learning_rate": 2.0888083648852267e-05,
"loss": 0.9213,
"num_input_tokens_seen": 427622400,
"step": 52200
},
{
"epoch": 1.6753155230956498,
"grad_norm": 0.6604040861129761,
"learning_rate": 2.0804495102210975e-05,
"loss": 0.944,
"num_input_tokens_seen": 428441600,
"step": 52300
},
{
"epoch": 1.6785188032545326,
"grad_norm": 0.6165716648101807,
"learning_rate": 2.0720954789784753e-05,
"loss": 0.8767,
"num_input_tokens_seen": 429260800,
"step": 52400
},
{
"epoch": 1.6817220834134154,
"grad_norm": 1.7939884662628174,
"learning_rate": 2.0637463672006595e-05,
"loss": 0.9095,
"num_input_tokens_seen": 430080000,
"step": 52500
},
{
"epoch": 1.684925363572298,
"grad_norm": 0.6687926054000854,
"learning_rate": 2.0554022708743943e-05,
"loss": 0.8976,
"num_input_tokens_seen": 430899200,
"step": 52600
},
{
"epoch": 1.6881286437311807,
"grad_norm": 0.7300702929496765,
"learning_rate": 2.0470632859287628e-05,
"loss": 0.9377,
"num_input_tokens_seen": 431718400,
"step": 52700
},
{
"epoch": 1.6913319238900635,
"grad_norm": 0.590376615524292,
"learning_rate": 2.0387295082340835e-05,
"loss": 0.8911,
"num_input_tokens_seen": 432537600,
"step": 52800
},
{
"epoch": 1.694535204048946,
"grad_norm": 0.556515097618103,
"learning_rate": 2.0304010336008112e-05,
"loss": 0.8771,
"num_input_tokens_seen": 433356800,
"step": 52900
},
{
"epoch": 1.6977384842078287,
"grad_norm": 0.6625654101371765,
"learning_rate": 2.0220779577784298e-05,
"loss": 0.9529,
"num_input_tokens_seen": 434176000,
"step": 53000
},
{
"epoch": 1.7009417643667115,
"grad_norm": 0.5537979602813721,
"learning_rate": 2.0137603764543573e-05,
"loss": 0.8813,
"num_input_tokens_seen": 434995200,
"step": 53100
},
{
"epoch": 1.7041450445255943,
"grad_norm": 0.49151819944381714,
"learning_rate": 2.0054483852528435e-05,
"loss": 0.8268,
"num_input_tokens_seen": 435814400,
"step": 53200
},
{
"epoch": 1.707348324684477,
"grad_norm": 0.6030770540237427,
"learning_rate": 1.9971420797338708e-05,
"loss": 0.9116,
"num_input_tokens_seen": 436633600,
"step": 53300
},
{
"epoch": 1.7105516048433596,
"grad_norm": 0.872156023979187,
"learning_rate": 1.9888415553920525e-05,
"loss": 0.8564,
"num_input_tokens_seen": 437452800,
"step": 53400
},
{
"epoch": 1.7137548850022424,
"grad_norm": 0.608736515045166,
"learning_rate": 1.9805469076555418e-05,
"loss": 0.8656,
"num_input_tokens_seen": 438272000,
"step": 53500
},
{
"epoch": 1.716958165161125,
"grad_norm": 0.6439238786697388,
"learning_rate": 1.9722582318849274e-05,
"loss": 0.8819,
"num_input_tokens_seen": 439091200,
"step": 53600
},
{
"epoch": 1.7201614453200076,
"grad_norm": 0.5254938006401062,
"learning_rate": 1.9639756233721433e-05,
"loss": 0.9118,
"num_input_tokens_seen": 439910400,
"step": 53700
},
{
"epoch": 1.7233647254788904,
"grad_norm": 0.6956652998924255,
"learning_rate": 1.9556991773393686e-05,
"loss": 0.8578,
"num_input_tokens_seen": 440729600,
"step": 53800
},
{
"epoch": 1.7265680056377732,
"grad_norm": 0.5322553515434265,
"learning_rate": 1.9474289889379334e-05,
"loss": 0.8907,
"num_input_tokens_seen": 441548800,
"step": 53900
},
{
"epoch": 1.7297712857966556,
"grad_norm": 0.706683874130249,
"learning_rate": 1.9391651532472296e-05,
"loss": 0.8853,
"num_input_tokens_seen": 442368000,
"step": 54000
},
{
"epoch": 1.7329745659555384,
"grad_norm": 1.7393512725830078,
"learning_rate": 1.930907765273611e-05,
"loss": 0.8942,
"num_input_tokens_seen": 443187200,
"step": 54100
},
{
"epoch": 1.7361778461144213,
"grad_norm": 0.6126461029052734,
"learning_rate": 1.922656919949306e-05,
"loss": 0.861,
"num_input_tokens_seen": 444006400,
"step": 54200
},
{
"epoch": 1.7393811262733039,
"grad_norm": 15.058053016662598,
"learning_rate": 1.914412712131325e-05,
"loss": 0.8764,
"num_input_tokens_seen": 444825600,
"step": 54300
},
{
"epoch": 1.7425844064321865,
"grad_norm": 1.590517520904541,
"learning_rate": 1.906175236600366e-05,
"loss": 0.9054,
"num_input_tokens_seen": 445644800,
"step": 54400
},
{
"epoch": 1.7457876865910693,
"grad_norm": 2.823185920715332,
"learning_rate": 1.8979445880597332e-05,
"loss": 0.9166,
"num_input_tokens_seen": 446464000,
"step": 54500
},
{
"epoch": 1.748990966749952,
"grad_norm": 0.6295785903930664,
"learning_rate": 1.8897208611342392e-05,
"loss": 0.893,
"num_input_tokens_seen": 447283200,
"step": 54600
},
{
"epoch": 1.7521942469088345,
"grad_norm": 2.9604554176330566,
"learning_rate": 1.881504150369125e-05,
"loss": 0.8883,
"num_input_tokens_seen": 448102400,
"step": 54700
},
{
"epoch": 1.7553975270677173,
"grad_norm": 0.12940554320812225,
"learning_rate": 1.873294550228965e-05,
"loss": 0.9114,
"num_input_tokens_seen": 448921600,
"step": 54800
},
{
"epoch": 1.7586008072266002,
"grad_norm": 0.6710172891616821,
"learning_rate": 1.8650921550965884e-05,
"loss": 0.9675,
"num_input_tokens_seen": 449740800,
"step": 54900
},
{
"epoch": 1.7618040873854828,
"grad_norm": 0.5467862486839294,
"learning_rate": 1.8568970592719903e-05,
"loss": 0.9055,
"num_input_tokens_seen": 450560000,
"step": 55000
},
{
"epoch": 1.7650073675443654,
"grad_norm": 1.6943007707595825,
"learning_rate": 1.8487093569712482e-05,
"loss": 0.8754,
"num_input_tokens_seen": 451379200,
"step": 55100
},
{
"epoch": 1.7682106477032482,
"grad_norm": 0.6068347692489624,
"learning_rate": 1.84052914232544e-05,
"loss": 0.9695,
"num_input_tokens_seen": 452198400,
"step": 55200
},
{
"epoch": 1.7714139278621308,
"grad_norm": 2.650592565536499,
"learning_rate": 1.8323565093795576e-05,
"loss": 0.8756,
"num_input_tokens_seen": 453017600,
"step": 55300
},
{
"epoch": 1.7746172080210134,
"grad_norm": 2.3554019927978516,
"learning_rate": 1.824191552091431e-05,
"loss": 0.8884,
"num_input_tokens_seen": 453836800,
"step": 55400
},
{
"epoch": 1.7778204881798962,
"grad_norm": 0.5100352764129639,
"learning_rate": 1.8160343643306467e-05,
"loss": 0.901,
"num_input_tokens_seen": 454656000,
"step": 55500
},
{
"epoch": 1.781023768338779,
"grad_norm": 2.276134490966797,
"learning_rate": 1.8078850398774666e-05,
"loss": 0.8653,
"num_input_tokens_seen": 455475200,
"step": 55600
},
{
"epoch": 1.7842270484976614,
"grad_norm": 0.6568858027458191,
"learning_rate": 1.7997436724217517e-05,
"loss": 0.9307,
"num_input_tokens_seen": 456294400,
"step": 55700
},
{
"epoch": 1.7874303286565443,
"grad_norm": 0.5729939341545105,
"learning_rate": 1.7916103555618818e-05,
"loss": 0.8938,
"num_input_tokens_seen": 457113600,
"step": 55800
},
{
"epoch": 1.790633608815427,
"grad_norm": 0.4960566759109497,
"learning_rate": 1.7834851828036855e-05,
"loss": 0.8622,
"num_input_tokens_seen": 457932800,
"step": 55900
},
{
"epoch": 1.7938368889743097,
"grad_norm": 0.6195512413978577,
"learning_rate": 1.7753682475593587e-05,
"loss": 0.9165,
"num_input_tokens_seen": 458752000,
"step": 56000
},
{
"epoch": 1.7970401691331923,
"grad_norm": 0.7224614024162292,
"learning_rate": 1.7672596431463963e-05,
"loss": 0.9159,
"num_input_tokens_seen": 459571200,
"step": 56100
},
{
"epoch": 1.8002434492920751,
"grad_norm": 0.683172881603241,
"learning_rate": 1.7591594627865134e-05,
"loss": 0.928,
"num_input_tokens_seen": 460390400,
"step": 56200
},
{
"epoch": 1.8034467294509577,
"grad_norm": 0.6346443891525269,
"learning_rate": 1.7510677996045787e-05,
"loss": 0.8891,
"num_input_tokens_seen": 461209600,
"step": 56300
},
{
"epoch": 1.8066500096098403,
"grad_norm": 0.5797076225280762,
"learning_rate": 1.7429847466275424e-05,
"loss": 0.9163,
"num_input_tokens_seen": 462028800,
"step": 56400
},
{
"epoch": 1.8098532897687232,
"grad_norm": 1.201037883758545,
"learning_rate": 1.734910396783364e-05,
"loss": 0.9401,
"num_input_tokens_seen": 462848000,
"step": 56500
},
{
"epoch": 1.813056569927606,
"grad_norm": 0.6015352606773376,
"learning_rate": 1.7268448428999508e-05,
"loss": 0.9391,
"num_input_tokens_seen": 463667200,
"step": 56600
},
{
"epoch": 1.8162598500864886,
"grad_norm": 0.6725329756736755,
"learning_rate": 1.71878817770408e-05,
"loss": 0.8751,
"num_input_tokens_seen": 464486400,
"step": 56700
},
{
"epoch": 1.8194631302453712,
"grad_norm": 0.7582192420959473,
"learning_rate": 1.7107404938203422e-05,
"loss": 0.9578,
"num_input_tokens_seen": 465305600,
"step": 56800
},
{
"epoch": 1.822666410404254,
"grad_norm": 0.5181425213813782,
"learning_rate": 1.702701883770074e-05,
"loss": 0.9462,
"num_input_tokens_seen": 466124800,
"step": 56900
},
{
"epoch": 1.8258696905631366,
"grad_norm": 0.672991931438446,
"learning_rate": 1.6946724399702905e-05,
"loss": 0.8676,
"num_input_tokens_seen": 466944000,
"step": 57000
},
{
"epoch": 1.8290729707220192,
"grad_norm": 2.6324303150177,
"learning_rate": 1.6866522547326292e-05,
"loss": 0.9282,
"num_input_tokens_seen": 467763200,
"step": 57100
},
{
"epoch": 1.832276250880902,
"grad_norm": 0.5964205861091614,
"learning_rate": 1.6786414202622818e-05,
"loss": 0.8611,
"num_input_tokens_seen": 468582400,
"step": 57200
},
{
"epoch": 1.835479531039785,
"grad_norm": 1.6168113946914673,
"learning_rate": 1.670640028656939e-05,
"loss": 0.8977,
"num_input_tokens_seen": 469401600,
"step": 57300
},
{
"epoch": 1.8386828111986673,
"grad_norm": 0.5584040284156799,
"learning_rate": 1.662648171905731e-05,
"loss": 0.9157,
"num_input_tokens_seen": 470220800,
"step": 57400
},
{
"epoch": 1.84188609135755,
"grad_norm": 0.6906948685646057,
"learning_rate": 1.654665941888169e-05,
"loss": 0.8808,
"num_input_tokens_seen": 471040000,
"step": 57500
},
{
"epoch": 1.845089371516433,
"grad_norm": 0.8261626958847046,
"learning_rate": 1.6466934303730866e-05,
"loss": 0.9322,
"num_input_tokens_seen": 471859200,
"step": 57600
},
{
"epoch": 1.8482926516753155,
"grad_norm": 0.5074647068977356,
"learning_rate": 1.6387307290175914e-05,
"loss": 0.9141,
"num_input_tokens_seen": 472678400,
"step": 57700
},
{
"epoch": 1.8514959318341981,
"grad_norm": 1.8539708852767944,
"learning_rate": 1.6307779293660034e-05,
"loss": 0.8777,
"num_input_tokens_seen": 473497600,
"step": 57800
},
{
"epoch": 1.854699211993081,
"grad_norm": 2.2079038619995117,
"learning_rate": 1.622835122848809e-05,
"loss": 0.8596,
"num_input_tokens_seen": 474316800,
"step": 57900
},
{
"epoch": 1.8579024921519636,
"grad_norm": 0.670155942440033,
"learning_rate": 1.6149024007816067e-05,
"loss": 0.9112,
"num_input_tokens_seen": 475136000,
"step": 58000
},
{
"epoch": 1.8611057723108462,
"grad_norm": 0.8173292875289917,
"learning_rate": 1.6069798543640543e-05,
"loss": 0.9513,
"num_input_tokens_seen": 475955200,
"step": 58100
},
{
"epoch": 1.864309052469729,
"grad_norm": 0.5929046273231506,
"learning_rate": 1.599067574678829e-05,
"loss": 0.8633,
"num_input_tokens_seen": 476774400,
"step": 58200
},
{
"epoch": 1.8675123326286118,
"grad_norm": 0.6177115440368652,
"learning_rate": 1.591165652690571e-05,
"loss": 0.8829,
"num_input_tokens_seen": 477593600,
"step": 58300
},
{
"epoch": 1.8707156127874944,
"grad_norm": 5.405032157897949,
"learning_rate": 1.5832741792448447e-05,
"loss": 0.853,
"num_input_tokens_seen": 478412800,
"step": 58400
},
{
"epoch": 1.873918892946377,
"grad_norm": 0.8819538950920105,
"learning_rate": 1.5753932450670892e-05,
"loss": 0.8632,
"num_input_tokens_seen": 479232000,
"step": 58500
},
{
"epoch": 1.8771221731052599,
"grad_norm": 0.7577266693115234,
"learning_rate": 1.5675229407615773e-05,
"loss": 0.8691,
"num_input_tokens_seen": 480051200,
"step": 58600
},
{
"epoch": 1.8803254532641425,
"grad_norm": 0.5581927299499512,
"learning_rate": 1.5596633568103764e-05,
"loss": 0.8898,
"num_input_tokens_seen": 480870400,
"step": 58700
},
{
"epoch": 1.883528733423025,
"grad_norm": 1.5271930694580078,
"learning_rate": 1.5518145835723034e-05,
"loss": 0.9001,
"num_input_tokens_seen": 481689600,
"step": 58800
},
{
"epoch": 1.886732013581908,
"grad_norm": 0.594035804271698,
"learning_rate": 1.54397671128189e-05,
"loss": 0.8988,
"num_input_tokens_seen": 482508800,
"step": 58900
},
{
"epoch": 1.8899352937407907,
"grad_norm": 0.778454601764679,
"learning_rate": 1.5361498300483423e-05,
"loss": 0.8744,
"num_input_tokens_seen": 483328000,
"step": 59000
},
{
"epoch": 1.893138573899673,
"grad_norm": 0.6719622611999512,
"learning_rate": 1.5283340298545056e-05,
"loss": 0.9189,
"num_input_tokens_seen": 484147200,
"step": 59100
},
{
"epoch": 1.896341854058556,
"grad_norm": 0.7632321119308472,
"learning_rate": 1.5205294005558335e-05,
"loss": 0.9133,
"num_input_tokens_seen": 484966400,
"step": 59200
},
{
"epoch": 1.8995451342174388,
"grad_norm": 2.033229112625122,
"learning_rate": 1.5127360318793481e-05,
"loss": 0.8913,
"num_input_tokens_seen": 485785600,
"step": 59300
},
{
"epoch": 1.9027484143763214,
"grad_norm": 0.598871648311615,
"learning_rate": 1.5049540134226158e-05,
"loss": 0.8857,
"num_input_tokens_seen": 486604800,
"step": 59400
},
{
"epoch": 1.905951694535204,
"grad_norm": 1.5140035152435303,
"learning_rate": 1.4971834346527102e-05,
"loss": 0.9104,
"num_input_tokens_seen": 487424000,
"step": 59500
},
{
"epoch": 1.9091549746940868,
"grad_norm": 1.2196921110153198,
"learning_rate": 1.4894243849051889e-05,
"loss": 0.8936,
"num_input_tokens_seen": 488243200,
"step": 59600
},
{
"epoch": 1.9123582548529694,
"grad_norm": 0.6041728854179382,
"learning_rate": 1.4816769533830638e-05,
"loss": 0.9233,
"num_input_tokens_seen": 489062400,
"step": 59700
},
{
"epoch": 1.915561535011852,
"grad_norm": 0.585239589214325,
"learning_rate": 1.4739412291557774e-05,
"loss": 0.893,
"num_input_tokens_seen": 489881600,
"step": 59800
},
{
"epoch": 1.9187648151707348,
"grad_norm": 0.5198357701301575,
"learning_rate": 1.4662173011581757e-05,
"loss": 0.8643,
"num_input_tokens_seen": 490700800,
"step": 59900
},
{
"epoch": 1.9219680953296177,
"grad_norm": 1.5068873167037964,
"learning_rate": 1.4585052581894881e-05,
"loss": 0.9376,
"num_input_tokens_seen": 491520000,
"step": 60000
},
{
"epoch": 1.9251713754885003,
"grad_norm": 1.573378562927246,
"learning_rate": 1.4508051889123075e-05,
"loss": 0.9354,
"num_input_tokens_seen": 492339200,
"step": 60100
},
{
"epoch": 1.9283746556473829,
"grad_norm": 0.7995052933692932,
"learning_rate": 1.4431171818515698e-05,
"loss": 0.8201,
"num_input_tokens_seen": 493158400,
"step": 60200
},
{
"epoch": 1.9315779358062657,
"grad_norm": 0.7116925716400146,
"learning_rate": 1.4354413253935336e-05,
"loss": 0.8322,
"num_input_tokens_seen": 493977600,
"step": 60300
},
{
"epoch": 1.9347812159651483,
"grad_norm": 0.714451253414154,
"learning_rate": 1.4277777077847665e-05,
"loss": 0.9181,
"num_input_tokens_seen": 494796800,
"step": 60400
},
{
"epoch": 1.937984496124031,
"grad_norm": 0.7062659859657288,
"learning_rate": 1.420126417131133e-05,
"loss": 0.8783,
"num_input_tokens_seen": 495616000,
"step": 60500
},
{
"epoch": 1.9411877762829137,
"grad_norm": 0.5767313838005066,
"learning_rate": 1.4124875413967767e-05,
"loss": 0.9239,
"num_input_tokens_seen": 496435200,
"step": 60600
},
{
"epoch": 1.9443910564417966,
"grad_norm": 0.7007090449333191,
"learning_rate": 1.4048611684031138e-05,
"loss": 0.8908,
"num_input_tokens_seen": 497254400,
"step": 60700
},
{
"epoch": 1.947594336600679,
"grad_norm": 0.663779079914093,
"learning_rate": 1.3972473858278184e-05,
"loss": 0.8845,
"num_input_tokens_seen": 498073600,
"step": 60800
},
{
"epoch": 1.9507976167595618,
"grad_norm": 1.9937938451766968,
"learning_rate": 1.3896462812038168e-05,
"loss": 0.8902,
"num_input_tokens_seen": 498892800,
"step": 60900
},
{
"epoch": 1.9540008969184446,
"grad_norm": 0.5911014676094055,
"learning_rate": 1.3820579419182838e-05,
"loss": 0.9283,
"num_input_tokens_seen": 499712000,
"step": 61000
},
{
"epoch": 1.9572041770773272,
"grad_norm": 0.680264949798584,
"learning_rate": 1.3744824552116343e-05,
"loss": 0.9166,
"num_input_tokens_seen": 500531200,
"step": 61100
},
{
"epoch": 1.9604074572362098,
"grad_norm": 0.5298569202423096,
"learning_rate": 1.3669199081765232e-05,
"loss": 0.9069,
"num_input_tokens_seen": 501350400,
"step": 61200
},
{
"epoch": 1.9636107373950926,
"grad_norm": 2.5101547241210938,
"learning_rate": 1.3593703877568407e-05,
"loss": 0.9138,
"num_input_tokens_seen": 502169600,
"step": 61300
},
{
"epoch": 1.9668140175539752,
"grad_norm": 1.6266756057739258,
"learning_rate": 1.3518339807467138e-05,
"loss": 0.8311,
"num_input_tokens_seen": 502988800,
"step": 61400
},
{
"epoch": 1.9700172977128578,
"grad_norm": 0.6949862241744995,
"learning_rate": 1.3443107737895121e-05,
"loss": 0.9508,
"num_input_tokens_seen": 503808000,
"step": 61500
},
{
"epoch": 1.9732205778717407,
"grad_norm": 1.9142687320709229,
"learning_rate": 1.3368008533768478e-05,
"loss": 0.8986,
"num_input_tokens_seen": 504627200,
"step": 61600
},
{
"epoch": 1.9764238580306235,
"grad_norm": 1.5811573266983032,
"learning_rate": 1.3293043058475835e-05,
"loss": 0.8775,
"num_input_tokens_seen": 505446400,
"step": 61700
},
{
"epoch": 1.979627138189506,
"grad_norm": 0.5435724258422852,
"learning_rate": 1.321821217386836e-05,
"loss": 0.8588,
"num_input_tokens_seen": 506265600,
"step": 61800
},
{
"epoch": 1.9828304183483887,
"grad_norm": 0.5689346194267273,
"learning_rate": 1.314351674024989e-05,
"loss": 0.9,
"num_input_tokens_seen": 507084800,
"step": 61900
},
{
"epoch": 1.9860336985072715,
"grad_norm": 0.5658956170082092,
"learning_rate": 1.3068957616367045e-05,
"loss": 0.8931,
"num_input_tokens_seen": 507904000,
"step": 62000
},
{
"epoch": 1.9892369786661541,
"grad_norm": 0.6352538466453552,
"learning_rate": 1.2994535659399327e-05,
"loss": 0.9254,
"num_input_tokens_seen": 508723200,
"step": 62100
},
{
"epoch": 1.9924402588250367,
"grad_norm": 1.6909618377685547,
"learning_rate": 1.2920251724949296e-05,
"loss": 0.8628,
"num_input_tokens_seen": 509542400,
"step": 62200
},
{
"epoch": 1.9956435389839196,
"grad_norm": 0.6590949892997742,
"learning_rate": 1.2846106667032693e-05,
"loss": 0.8509,
"num_input_tokens_seen": 510361600,
"step": 62300
},
{
"epoch": 1.9988468191428024,
"grad_norm": 2.059828042984009,
"learning_rate": 1.2772101338068649e-05,
"loss": 0.8547,
"num_input_tokens_seen": 511180800,
"step": 62400
},
{
"epoch": 2.0020500993016848,
"grad_norm": 0.8146264553070068,
"learning_rate": 1.2698236588869894e-05,
"loss": 0.8274,
"num_input_tokens_seen": 512000000,
"step": 62500
},
{
"epoch": 2.0052533794605676,
"grad_norm": 0.5894434452056885,
"learning_rate": 1.2624513268632967e-05,
"loss": 0.8213,
"num_input_tokens_seen": 512819200,
"step": 62600
},
{
"epoch": 2.0084566596194504,
"grad_norm": 1.9424681663513184,
"learning_rate": 1.2550932224928425e-05,
"loss": 0.8608,
"num_input_tokens_seen": 513638400,
"step": 62700
},
{
"epoch": 2.011659939778333,
"grad_norm": 0.6579126715660095,
"learning_rate": 1.2477494303691157e-05,
"loss": 0.836,
"num_input_tokens_seen": 514457600,
"step": 62800
},
{
"epoch": 2.0148632199372156,
"grad_norm": 0.5051004886627197,
"learning_rate": 1.2404200349210577e-05,
"loss": 0.8208,
"num_input_tokens_seen": 515276800,
"step": 62900
},
{
"epoch": 2.0180665000960984,
"grad_norm": 0.6397780179977417,
"learning_rate": 1.2331051204121009e-05,
"loss": 0.8293,
"num_input_tokens_seen": 516096000,
"step": 63000
},
{
"epoch": 2.0212697802549813,
"grad_norm": 0.7705442309379578,
"learning_rate": 1.2258047709391945e-05,
"loss": 0.8663,
"num_input_tokens_seen": 516915200,
"step": 63100
},
{
"epoch": 2.0244730604138637,
"grad_norm": 0.711100697517395,
"learning_rate": 1.218519070431836e-05,
"loss": 0.8186,
"num_input_tokens_seen": 517734400,
"step": 63200
},
{
"epoch": 2.0276763405727465,
"grad_norm": 0.6769080758094788,
"learning_rate": 1.2112481026511138e-05,
"loss": 0.8468,
"num_input_tokens_seen": 518553600,
"step": 63300
},
{
"epoch": 2.0308796207316293,
"grad_norm": 0.7686530351638794,
"learning_rate": 1.2039919511887338e-05,
"loss": 0.7955,
"num_input_tokens_seen": 519372800,
"step": 63400
},
{
"epoch": 2.0340829008905117,
"grad_norm": 0.826252281665802,
"learning_rate": 1.1967506994660685e-05,
"loss": 0.8313,
"num_input_tokens_seen": 520192000,
"step": 63500
},
{
"epoch": 2.0372861810493945,
"grad_norm": 1.5545631647109985,
"learning_rate": 1.1895244307331923e-05,
"loss": 0.8387,
"num_input_tokens_seen": 521011200,
"step": 63600
},
{
"epoch": 2.0404894612082773,
"grad_norm": 2.142545461654663,
"learning_rate": 1.1823132280679235e-05,
"loss": 0.8087,
"num_input_tokens_seen": 521830400,
"step": 63700
},
{
"epoch": 2.04369274136716,
"grad_norm": 1.7032113075256348,
"learning_rate": 1.1751171743748737e-05,
"loss": 0.8357,
"num_input_tokens_seen": 522649600,
"step": 63800
},
{
"epoch": 2.0468960215260426,
"grad_norm": 0.6579723358154297,
"learning_rate": 1.1679363523844918e-05,
"loss": 0.8435,
"num_input_tokens_seen": 523468800,
"step": 63900
},
{
"epoch": 2.0500993016849254,
"grad_norm": 0.6495528817176819,
"learning_rate": 1.1607708446521125e-05,
"loss": 0.8702,
"num_input_tokens_seen": 524288000,
"step": 64000
},
{
"epoch": 2.053302581843808,
"grad_norm": 0.5699741840362549,
"learning_rate": 1.153620733557007e-05,
"loss": 0.8436,
"num_input_tokens_seen": 525107200,
"step": 64100
},
{
"epoch": 2.0565058620026906,
"grad_norm": 0.5475245118141174,
"learning_rate": 1.1464861013014391e-05,
"loss": 0.825,
"num_input_tokens_seen": 525926400,
"step": 64200
},
{
"epoch": 2.0597091421615734,
"grad_norm": 2.3118770122528076,
"learning_rate": 1.139367029909717e-05,
"loss": 0.8469,
"num_input_tokens_seen": 526745600,
"step": 64300
},
{
"epoch": 2.0629124223204562,
"grad_norm": 0.7807962894439697,
"learning_rate": 1.1322636012272517e-05,
"loss": 0.8397,
"num_input_tokens_seen": 527564800,
"step": 64400
},
{
"epoch": 2.0661157024793386,
"grad_norm": 1.0216293334960938,
"learning_rate": 1.1251758969196147e-05,
"loss": 0.7898,
"num_input_tokens_seen": 528384000,
"step": 64500
},
{
"epoch": 2.0693189826382214,
"grad_norm": 0.7191298604011536,
"learning_rate": 1.1181039984715991e-05,
"loss": 0.8449,
"num_input_tokens_seen": 529203200,
"step": 64600
},
{
"epoch": 2.0725222627971043,
"grad_norm": 0.4787365198135376,
"learning_rate": 1.1110479871862862e-05,
"loss": 0.7879,
"num_input_tokens_seen": 530022400,
"step": 64700
},
{
"epoch": 2.075725542955987,
"grad_norm": 0.7449747323989868,
"learning_rate": 1.1040079441841065e-05,
"loss": 0.866,
"num_input_tokens_seen": 530841600,
"step": 64800
},
{
"epoch": 2.0789288231148695,
"grad_norm": 0.7580021619796753,
"learning_rate": 1.0969839504019108e-05,
"loss": 0.851,
"num_input_tokens_seen": 531660800,
"step": 64900
},
{
"epoch": 2.0821321032737523,
"grad_norm": 0.6036601662635803,
"learning_rate": 1.0899760865920355e-05,
"loss": 0.814,
"num_input_tokens_seen": 532480000,
"step": 65000
},
{
"epoch": 2.085335383432635,
"grad_norm": 0.553875207901001,
"learning_rate": 1.0829844333213766e-05,
"loss": 0.8307,
"num_input_tokens_seen": 533299200,
"step": 65100
},
{
"epoch": 2.0885386635915175,
"grad_norm": 0.6239012479782104,
"learning_rate": 1.0760090709704642e-05,
"loss": 0.8406,
"num_input_tokens_seen": 534118400,
"step": 65200
},
{
"epoch": 2.0917419437504003,
"grad_norm": 0.8101912140846252,
"learning_rate": 1.0690500797325387e-05,
"loss": 0.8263,
"num_input_tokens_seen": 534937600,
"step": 65300
},
{
"epoch": 2.094945223909283,
"grad_norm": 0.827496349811554,
"learning_rate": 1.0621075396126265e-05,
"loss": 0.7959,
"num_input_tokens_seen": 535756800,
"step": 65400
},
{
"epoch": 2.098148504068166,
"grad_norm": 0.7722252607345581,
"learning_rate": 1.055181530426621e-05,
"loss": 0.8417,
"num_input_tokens_seen": 536576000,
"step": 65500
},
{
"epoch": 2.1013517842270484,
"grad_norm": 0.8276936411857605,
"learning_rate": 1.0482721318003644e-05,
"loss": 0.8267,
"num_input_tokens_seen": 537395200,
"step": 65600
},
{
"epoch": 2.104555064385931,
"grad_norm": 0.5818492770195007,
"learning_rate": 1.0413794231687357e-05,
"loss": 0.811,
"num_input_tokens_seen": 538214400,
"step": 65700
},
{
"epoch": 2.107758344544814,
"grad_norm": 1.9946190118789673,
"learning_rate": 1.0345034837747342e-05,
"loss": 0.8376,
"num_input_tokens_seen": 539033600,
"step": 65800
},
{
"epoch": 2.1109616247036964,
"grad_norm": 0.5959033370018005,
"learning_rate": 1.0276443926685694e-05,
"loss": 0.8641,
"num_input_tokens_seen": 539852800,
"step": 65900
},
{
"epoch": 2.1141649048625792,
"grad_norm": 0.9433934092521667,
"learning_rate": 1.0208022287067509e-05,
"loss": 0.8445,
"num_input_tokens_seen": 540672000,
"step": 66000
},
{
"epoch": 2.117368185021462,
"grad_norm": 1.3814393281936646,
"learning_rate": 1.0139770705511833e-05,
"loss": 0.8783,
"num_input_tokens_seen": 541491200,
"step": 66100
},
{
"epoch": 2.120571465180345,
"grad_norm": 0.5552910566329956,
"learning_rate": 1.0071689966682623e-05,
"loss": 0.7836,
"num_input_tokens_seen": 542310400,
"step": 66200
},
{
"epoch": 2.1237747453392273,
"grad_norm": 0.6831013560295105,
"learning_rate": 1.0003780853279732e-05,
"loss": 0.8143,
"num_input_tokens_seen": 543129600,
"step": 66300
},
{
"epoch": 2.12697802549811,
"grad_norm": 1.8912497758865356,
"learning_rate": 9.936044146029855e-06,
"loss": 0.8582,
"num_input_tokens_seen": 543948800,
"step": 66400
},
{
"epoch": 2.130181305656993,
"grad_norm": 0.6759600639343262,
"learning_rate": 9.868480623677643e-06,
"loss": 0.8295,
"num_input_tokens_seen": 544768000,
"step": 66500
},
{
"epoch": 2.1333845858158753,
"grad_norm": 0.6555814146995544,
"learning_rate": 9.801091062976665e-06,
"loss": 0.7856,
"num_input_tokens_seen": 545587200,
"step": 66600
},
{
"epoch": 2.136587865974758,
"grad_norm": 0.7342298626899719,
"learning_rate": 9.733876238680531e-06,
"loss": 0.8144,
"num_input_tokens_seen": 546406400,
"step": 66700
},
{
"epoch": 2.139791146133641,
"grad_norm": 1.6135506629943848,
"learning_rate": 9.666836923533987e-06,
"loss": 0.7658,
"num_input_tokens_seen": 547225600,
"step": 66800
},
{
"epoch": 2.1429944262925233,
"grad_norm": 0.6479013562202454,
"learning_rate": 9.599973888263972e-06,
"loss": 0.7818,
"num_input_tokens_seen": 548044800,
"step": 66900
},
{
"epoch": 2.146197706451406,
"grad_norm": 0.8639338612556458,
"learning_rate": 9.533287901570843e-06,
"loss": 0.8259,
"num_input_tokens_seen": 548864000,
"step": 67000
},
{
"epoch": 2.149400986610289,
"grad_norm": 0.852070152759552,
"learning_rate": 9.466779730119449e-06,
"loss": 0.84,
"num_input_tokens_seen": 549683200,
"step": 67100
},
{
"epoch": 2.152604266769172,
"grad_norm": 0.8585788607597351,
"learning_rate": 9.400450138530394e-06,
"loss": 0.8595,
"num_input_tokens_seen": 550502400,
"step": 67200
},
{
"epoch": 2.155807546928054,
"grad_norm": 2.652194023132324,
"learning_rate": 9.334299889371217e-06,
"loss": 0.8404,
"num_input_tokens_seen": 551321600,
"step": 67300
},
{
"epoch": 2.159010827086937,
"grad_norm": 0.6588045954704285,
"learning_rate": 9.268329743147583e-06,
"loss": 0.7933,
"num_input_tokens_seen": 552140800,
"step": 67400
},
{
"epoch": 2.16221410724582,
"grad_norm": 2.807159423828125,
"learning_rate": 9.202540458294623e-06,
"loss": 0.8066,
"num_input_tokens_seen": 552960000,
"step": 67500
},
{
"epoch": 2.1654173874047022,
"grad_norm": 0.7351047396659851,
"learning_rate": 9.136932791168132e-06,
"loss": 0.8831,
"num_input_tokens_seen": 553779200,
"step": 67600
},
{
"epoch": 2.168620667563585,
"grad_norm": 0.6064037084579468,
"learning_rate": 9.071507496035943e-06,
"loss": 0.7602,
"num_input_tokens_seen": 554598400,
"step": 67700
},
{
"epoch": 2.171823947722468,
"grad_norm": 0.6641263365745544,
"learning_rate": 9.006265325069197e-06,
"loss": 0.7984,
"num_input_tokens_seen": 555417600,
"step": 67800
},
{
"epoch": 2.1750272278813503,
"grad_norm": 0.6006192564964294,
"learning_rate": 8.941207028333737e-06,
"loss": 0.7831,
"num_input_tokens_seen": 556236800,
"step": 67900
},
{
"epoch": 2.178230508040233,
"grad_norm": 0.6849149465560913,
"learning_rate": 8.876333353781468e-06,
"loss": 0.829,
"num_input_tokens_seen": 557056000,
"step": 68000
},
{
"epoch": 2.181433788199116,
"grad_norm": 0.7569016218185425,
"learning_rate": 8.811645047241767e-06,
"loss": 0.8623,
"num_input_tokens_seen": 557875200,
"step": 68100
},
{
"epoch": 2.1846370683579988,
"grad_norm": 0.7035521268844604,
"learning_rate": 8.74714285241289e-06,
"loss": 0.8444,
"num_input_tokens_seen": 558694400,
"step": 68200
},
{
"epoch": 2.187840348516881,
"grad_norm": 0.7252819538116455,
"learning_rate": 8.682827510853426e-06,
"loss": 0.8287,
"num_input_tokens_seen": 559513600,
"step": 68300
},
{
"epoch": 2.191043628675764,
"grad_norm": 0.5455666780471802,
"learning_rate": 8.618699761973792e-06,
"loss": 0.7785,
"num_input_tokens_seen": 560332800,
"step": 68400
},
{
"epoch": 2.194246908834647,
"grad_norm": 0.8008429408073425,
"learning_rate": 8.554760343027724e-06,
"loss": 0.8595,
"num_input_tokens_seen": 561152000,
"step": 68500
},
{
"epoch": 2.197450188993529,
"grad_norm": 0.755208432674408,
"learning_rate": 8.491009989103796e-06,
"loss": 0.8538,
"num_input_tokens_seen": 561971200,
"step": 68600
},
{
"epoch": 2.200653469152412,
"grad_norm": 0.5776748657226562,
"learning_rate": 8.427449433116952e-06,
"loss": 0.8333,
"num_input_tokens_seen": 562790400,
"step": 68700
},
{
"epoch": 2.203856749311295,
"grad_norm": 0.6535948514938354,
"learning_rate": 8.364079405800105e-06,
"loss": 0.8281,
"num_input_tokens_seen": 563609600,
"step": 68800
},
{
"epoch": 2.2070600294701777,
"grad_norm": 0.5949485898017883,
"learning_rate": 8.30090063569573e-06,
"loss": 0.7887,
"num_input_tokens_seen": 564428800,
"step": 68900
},
{
"epoch": 2.21026330962906,
"grad_norm": 3.0284650325775146,
"learning_rate": 8.237913849147497e-06,
"loss": 0.8451,
"num_input_tokens_seen": 565248000,
"step": 69000
},
{
"epoch": 2.213466589787943,
"grad_norm": 0.5593298673629761,
"learning_rate": 8.1751197702919e-06,
"loss": 0.8596,
"num_input_tokens_seen": 566067200,
"step": 69100
},
{
"epoch": 2.2166698699468257,
"grad_norm": 0.670230507850647,
"learning_rate": 8.112519121049942e-06,
"loss": 0.8584,
"num_input_tokens_seen": 566886400,
"step": 69200
},
{
"epoch": 2.219873150105708,
"grad_norm": 1.34910249710083,
"learning_rate": 8.050112621118822e-06,
"loss": 0.8518,
"num_input_tokens_seen": 567705600,
"step": 69300
},
{
"epoch": 2.223076430264591,
"grad_norm": 0.6535902619361877,
"learning_rate": 7.987900987963695e-06,
"loss": 0.8544,
"num_input_tokens_seen": 568524800,
"step": 69400
},
{
"epoch": 2.2262797104234737,
"grad_norm": 0.594032883644104,
"learning_rate": 7.925884936809396e-06,
"loss": 0.8395,
"num_input_tokens_seen": 569344000,
"step": 69500
},
{
"epoch": 2.2294829905823565,
"grad_norm": 0.6679059863090515,
"learning_rate": 7.864065180632233e-06,
"loss": 0.8681,
"num_input_tokens_seen": 570163200,
"step": 69600
},
{
"epoch": 2.232686270741239,
"grad_norm": 0.5853981375694275,
"learning_rate": 7.802442430151757e-06,
"loss": 0.7735,
"num_input_tokens_seen": 570982400,
"step": 69700
},
{
"epoch": 2.2358895509001218,
"grad_norm": 1.4077626466751099,
"learning_rate": 7.741017393822628e-06,
"loss": 0.7853,
"num_input_tokens_seen": 571801600,
"step": 69800
},
{
"epoch": 2.2390928310590046,
"grad_norm": 0.6583539247512817,
"learning_rate": 7.679790777826459e-06,
"loss": 0.8403,
"num_input_tokens_seen": 572620800,
"step": 69900
},
{
"epoch": 2.242296111217887,
"grad_norm": 0.8946901559829712,
"learning_rate": 7.618763286063698e-06,
"loss": 0.8336,
"num_input_tokens_seen": 573440000,
"step": 70000
},
{
"epoch": 2.24549939137677,
"grad_norm": 0.7540560364723206,
"learning_rate": 7.55793562014554e-06,
"loss": 0.7682,
"num_input_tokens_seen": 574259200,
"step": 70100
},
{
"epoch": 2.2487026715356526,
"grad_norm": 0.7601240873336792,
"learning_rate": 7.497308479385831e-06,
"loss": 0.8367,
"num_input_tokens_seen": 575078400,
"step": 70200
},
{
"epoch": 2.2519059516945354,
"grad_norm": 0.7198605537414551,
"learning_rate": 7.43688256079306e-06,
"loss": 0.8119,
"num_input_tokens_seen": 575897600,
"step": 70300
},
{
"epoch": 2.255109231853418,
"grad_norm": 0.7405291199684143,
"learning_rate": 7.376658559062349e-06,
"loss": 0.8231,
"num_input_tokens_seen": 576716800,
"step": 70400
},
{
"epoch": 2.2583125120123007,
"grad_norm": 0.6844334602355957,
"learning_rate": 7.31663716656745e-06,
"loss": 0.852,
"num_input_tokens_seen": 577536000,
"step": 70500
},
{
"epoch": 2.2615157921711835,
"grad_norm": 3.182279348373413,
"learning_rate": 7.256819073352775e-06,
"loss": 0.82,
"num_input_tokens_seen": 578355200,
"step": 70600
},
{
"epoch": 2.264719072330066,
"grad_norm": 0.7010332345962524,
"learning_rate": 7.197204967125498e-06,
"loss": 0.8417,
"num_input_tokens_seen": 579174400,
"step": 70700
},
{
"epoch": 2.2679223524889487,
"grad_norm": 3.276526927947998,
"learning_rate": 7.137795533247604e-06,
"loss": 0.8252,
"num_input_tokens_seen": 579993600,
"step": 70800
},
{
"epoch": 2.2711256326478315,
"grad_norm": 0.6692455410957336,
"learning_rate": 7.078591454728056e-06,
"loss": 0.8195,
"num_input_tokens_seen": 580812800,
"step": 70900
},
{
"epoch": 2.274328912806714,
"grad_norm": 0.6837947368621826,
"learning_rate": 7.019593412214914e-06,
"loss": 0.8012,
"num_input_tokens_seen": 581632000,
"step": 71000
},
{
"epoch": 2.2775321929655967,
"grad_norm": 0.8453261256217957,
"learning_rate": 6.960802083987503e-06,
"loss": 0.8097,
"num_input_tokens_seen": 582451200,
"step": 71100
},
{
"epoch": 2.2807354731244796,
"grad_norm": 0.7615090608596802,
"learning_rate": 6.902218145948647e-06,
"loss": 0.8216,
"num_input_tokens_seen": 583270400,
"step": 71200
},
{
"epoch": 2.283938753283362,
"grad_norm": 2.4880526065826416,
"learning_rate": 6.8438422716168595e-06,
"loss": 0.829,
"num_input_tokens_seen": 584089600,
"step": 71300
},
{
"epoch": 2.2871420334422448,
"grad_norm": 2.184436798095703,
"learning_rate": 6.785675132118638e-06,
"loss": 0.8557,
"num_input_tokens_seen": 584908800,
"step": 71400
},
{
"epoch": 2.2903453136011276,
"grad_norm": 0.6513957977294922,
"learning_rate": 6.72771739618073e-06,
"loss": 0.8199,
"num_input_tokens_seen": 585728000,
"step": 71500
},
{
"epoch": 2.2935485937600104,
"grad_norm": 2.187042713165283,
"learning_rate": 6.6699697301224214e-06,
"loss": 0.876,
"num_input_tokens_seen": 586547200,
"step": 71600
},
{
"epoch": 2.296751873918893,
"grad_norm": 0.6848201751708984,
"learning_rate": 6.612432797847937e-06,
"loss": 0.8013,
"num_input_tokens_seen": 587366400,
"step": 71700
},
{
"epoch": 2.2999551540777756,
"grad_norm": 0.9538524150848389,
"learning_rate": 6.55510726083873e-06,
"loss": 0.7922,
"num_input_tokens_seen": 588185600,
"step": 71800
},
{
"epoch": 2.3031584342366584,
"grad_norm": 0.6234622597694397,
"learning_rate": 6.4979937781459586e-06,
"loss": 0.7617,
"num_input_tokens_seen": 589004800,
"step": 71900
},
{
"epoch": 2.306361714395541,
"grad_norm": 0.7952730655670166,
"learning_rate": 6.441093006382831e-06,
"loss": 0.8744,
"num_input_tokens_seen": 589824000,
"step": 72000
},
{
"epoch": 2.3095649945544237,
"grad_norm": 0.6471823453903198,
"learning_rate": 6.384405599717125e-06,
"loss": 0.7952,
"num_input_tokens_seen": 590643200,
"step": 72100
},
{
"epoch": 2.3127682747133065,
"grad_norm": 0.713498592376709,
"learning_rate": 6.327932209863618e-06,
"loss": 0.817,
"num_input_tokens_seen": 591462400,
"step": 72200
},
{
"epoch": 2.3159715548721893,
"grad_norm": 0.8223375678062439,
"learning_rate": 6.271673486076629e-06,
"loss": 0.8127,
"num_input_tokens_seen": 592281600,
"step": 72300
},
{
"epoch": 2.3191748350310717,
"grad_norm": 2.696056842803955,
"learning_rate": 6.215630075142523e-06,
"loss": 0.8191,
"num_input_tokens_seen": 593100800,
"step": 72400
},
{
"epoch": 2.3223781151899545,
"grad_norm": 0.6731551885604858,
"learning_rate": 6.159802621372279e-06,
"loss": 0.831,
"num_input_tokens_seen": 593920000,
"step": 72500
},
{
"epoch": 2.3255813953488373,
"grad_norm": 0.6898087859153748,
"learning_rate": 6.1041917665941275e-06,
"loss": 0.8249,
"num_input_tokens_seen": 594739200,
"step": 72600
},
{
"epoch": 2.3287846755077197,
"grad_norm": 0.6532519459724426,
"learning_rate": 6.048798150146112e-06,
"loss": 0.7416,
"num_input_tokens_seen": 595558400,
"step": 72700
},
{
"epoch": 2.3319879556666026,
"grad_norm": 0.6760110259056091,
"learning_rate": 5.993622408868788e-06,
"loss": 0.8451,
"num_input_tokens_seen": 596377600,
"step": 72800
},
{
"epoch": 2.3351912358254854,
"grad_norm": 2.732374668121338,
"learning_rate": 5.9386651770978516e-06,
"loss": 0.8654,
"num_input_tokens_seen": 597196800,
"step": 72900
},
{
"epoch": 2.338394515984368,
"grad_norm": 0.6297926306724548,
"learning_rate": 5.8839270866568816e-06,
"loss": 0.8397,
"num_input_tokens_seen": 598016000,
"step": 73000
},
{
"epoch": 2.3415977961432506,
"grad_norm": 0.5178629755973816,
"learning_rate": 5.829408766850078e-06,
"loss": 0.833,
"num_input_tokens_seen": 598835200,
"step": 73100
},
{
"epoch": 2.3448010763021334,
"grad_norm": 0.5522879958152771,
"learning_rate": 5.7751108444550066e-06,
"loss": 0.8174,
"num_input_tokens_seen": 599654400,
"step": 73200
},
{
"epoch": 2.3480043564610162,
"grad_norm": 0.6307721734046936,
"learning_rate": 5.7210339437154175e-06,
"loss": 0.7809,
"num_input_tokens_seen": 600473600,
"step": 73300
},
{
"epoch": 2.3512076366198986,
"grad_norm": 0.6830965876579285,
"learning_rate": 5.667178686334037e-06,
"loss": 0.8243,
"num_input_tokens_seen": 601292800,
"step": 73400
},
{
"epoch": 2.3544109167787814,
"grad_norm": 2.0725910663604736,
"learning_rate": 5.613545691465438e-06,
"loss": 0.7868,
"num_input_tokens_seen": 602112000,
"step": 73500
},
{
"epoch": 2.3576141969376643,
"grad_norm": 0.994819700717926,
"learning_rate": 5.560135575708927e-06,
"loss": 0.8176,
"num_input_tokens_seen": 602931200,
"step": 73600
},
{
"epoch": 2.360817477096547,
"grad_norm": 0.7025684714317322,
"learning_rate": 5.506948953101454e-06,
"loss": 0.8417,
"num_input_tokens_seen": 603750400,
"step": 73700
},
{
"epoch": 2.3640207572554295,
"grad_norm": 0.6975109577178955,
"learning_rate": 5.45398643511055e-06,
"loss": 0.8552,
"num_input_tokens_seen": 604569600,
"step": 73800
},
{
"epoch": 2.3672240374143123,
"grad_norm": 0.6180407404899597,
"learning_rate": 5.401248630627282e-06,
"loss": 0.8423,
"num_input_tokens_seen": 605388800,
"step": 73900
},
{
"epoch": 2.370427317573195,
"grad_norm": 0.8194453716278076,
"learning_rate": 5.3487361459592626e-06,
"loss": 0.8278,
"num_input_tokens_seen": 606208000,
"step": 74000
},
{
"epoch": 2.3736305977320775,
"grad_norm": 0.6039137244224548,
"learning_rate": 5.296449584823707e-06,
"loss": 0.8354,
"num_input_tokens_seen": 607027200,
"step": 74100
},
{
"epoch": 2.3768338778909603,
"grad_norm": 0.6407757997512817,
"learning_rate": 5.244389548340456e-06,
"loss": 0.8292,
"num_input_tokens_seen": 607846400,
"step": 74200
},
{
"epoch": 2.380037158049843,
"grad_norm": 1.9735205173492432,
"learning_rate": 5.19255663502507e-06,
"loss": 0.8604,
"num_input_tokens_seen": 608665600,
"step": 74300
},
{
"epoch": 2.3832404382087256,
"grad_norm": 0.7297560572624207,
"learning_rate": 5.1409514407819745e-06,
"loss": 0.8464,
"num_input_tokens_seen": 609484800,
"step": 74400
},
{
"epoch": 2.3864437183676084,
"grad_norm": 0.641272246837616,
"learning_rate": 5.089574558897564e-06,
"loss": 0.8711,
"num_input_tokens_seen": 610304000,
"step": 74500
},
{
"epoch": 2.389646998526491,
"grad_norm": 0.5732747316360474,
"learning_rate": 5.038426580033431e-06,
"loss": 0.8357,
"num_input_tokens_seen": 611123200,
"step": 74600
},
{
"epoch": 2.3928502786853736,
"grad_norm": 0.7175111770629883,
"learning_rate": 4.98750809221955e-06,
"loss": 0.8782,
"num_input_tokens_seen": 611942400,
"step": 74700
},
{
"epoch": 2.3960535588442564,
"grad_norm": 0.6939539909362793,
"learning_rate": 4.936819680847499e-06,
"loss": 0.8051,
"num_input_tokens_seen": 612761600,
"step": 74800
},
{
"epoch": 2.3992568390031392,
"grad_norm": 0.9897929430007935,
"learning_rate": 4.886361928663779e-06,
"loss": 0.8208,
"num_input_tokens_seen": 613580800,
"step": 74900
},
{
"epoch": 2.402460119162022,
"grad_norm": 1.3492214679718018,
"learning_rate": 4.836135415763054e-06,
"loss": 0.8081,
"num_input_tokens_seen": 614400000,
"step": 75000
},
{
"epoch": 2.4056633993209044,
"grad_norm": 0.6165256500244141,
"learning_rate": 4.786140719581539e-06,
"loss": 0.8612,
"num_input_tokens_seen": 615219200,
"step": 75100
},
{
"epoch": 2.4088666794797873,
"grad_norm": 0.7315238118171692,
"learning_rate": 4.73637841489033e-06,
"loss": 0.8201,
"num_input_tokens_seen": 616038400,
"step": 75200
},
{
"epoch": 2.41206995963867,
"grad_norm": 0.5693472027778625,
"learning_rate": 4.686849073788782e-06,
"loss": 0.8319,
"num_input_tokens_seen": 616857600,
"step": 75300
},
{
"epoch": 2.4152732397975525,
"grad_norm": 1.28626549243927,
"learning_rate": 4.637553265697978e-06,
"loss": 0.8012,
"num_input_tokens_seen": 617676800,
"step": 75400
},
{
"epoch": 2.4184765199564353,
"grad_norm": 3.020348072052002,
"learning_rate": 4.5884915573541326e-06,
"loss": 0.8216,
"num_input_tokens_seen": 618496000,
"step": 75500
},
{
"epoch": 2.421679800115318,
"grad_norm": 1.7923747301101685,
"learning_rate": 4.539664512802125e-06,
"loss": 0.8269,
"num_input_tokens_seen": 619315200,
"step": 75600
},
{
"epoch": 2.424883080274201,
"grad_norm": 0.6749047636985779,
"learning_rate": 4.491072693388957e-06,
"loss": 0.7949,
"num_input_tokens_seen": 620134400,
"step": 75700
},
{
"epoch": 2.4280863604330833,
"grad_norm": 0.8918429613113403,
"learning_rate": 4.442716657757354e-06,
"loss": 0.8153,
"num_input_tokens_seen": 620953600,
"step": 75800
},
{
"epoch": 2.431289640591966,
"grad_norm": 0.8165135383605957,
"learning_rate": 4.3945969618393255e-06,
"loss": 0.8063,
"num_input_tokens_seen": 621772800,
"step": 75900
},
{
"epoch": 2.434492920750849,
"grad_norm": 2.7509946823120117,
"learning_rate": 4.346714158849744e-06,
"loss": 0.7779,
"num_input_tokens_seen": 622592000,
"step": 76000
},
{
"epoch": 2.4376962009097314,
"grad_norm": 1.2128119468688965,
"learning_rate": 4.299068799280032e-06,
"loss": 0.8322,
"num_input_tokens_seen": 623411200,
"step": 76100
},
{
"epoch": 2.440899481068614,
"grad_norm": 1.1851086616516113,
"learning_rate": 4.251661430891787e-06,
"loss": 0.8294,
"num_input_tokens_seen": 624230400,
"step": 76200
},
{
"epoch": 2.444102761227497,
"grad_norm": 0.7874124646186829,
"learning_rate": 4.20449259871053e-06,
"loss": 0.819,
"num_input_tokens_seen": 625049600,
"step": 76300
},
{
"epoch": 2.44730604138638,
"grad_norm": 0.6558551788330078,
"learning_rate": 4.157562845019405e-06,
"loss": 0.7969,
"num_input_tokens_seen": 625868800,
"step": 76400
},
{
"epoch": 2.4505093215452622,
"grad_norm": 0.7723847031593323,
"learning_rate": 4.1108727093529644e-06,
"loss": 0.8516,
"num_input_tokens_seen": 626688000,
"step": 76500
},
{
"epoch": 2.453712601704145,
"grad_norm": 0.6779108047485352,
"learning_rate": 4.064422728490946e-06,
"loss": 0.8471,
"num_input_tokens_seen": 627507200,
"step": 76600
},
{
"epoch": 2.456915881863028,
"grad_norm": 0.5954208970069885,
"learning_rate": 4.018213436452117e-06,
"loss": 0.84,
"num_input_tokens_seen": 628326400,
"step": 76700
},
{
"epoch": 2.4601191620219103,
"grad_norm": 2.6484439373016357,
"learning_rate": 3.972245364488136e-06,
"loss": 0.8224,
"num_input_tokens_seen": 629145600,
"step": 76800
},
{
"epoch": 2.463322442180793,
"grad_norm": 0.6489027142524719,
"learning_rate": 3.926519041077445e-06,
"loss": 0.8476,
"num_input_tokens_seen": 629964800,
"step": 76900
},
{
"epoch": 2.466525722339676,
"grad_norm": 2.0896570682525635,
"learning_rate": 3.8810349919191825e-06,
"loss": 0.8256,
"num_input_tokens_seen": 630784000,
"step": 77000
},
{
"epoch": 2.4697290024985588,
"grad_norm": 0.8174818158149719,
"learning_rate": 3.835793739927151e-06,
"loss": 0.8493,
"num_input_tokens_seen": 631603200,
"step": 77100
},
{
"epoch": 2.472932282657441,
"grad_norm": 0.7576190829277039,
"learning_rate": 3.7907958052237875e-06,
"loss": 0.8275,
"num_input_tokens_seen": 632422400,
"step": 77200
},
{
"epoch": 2.476135562816324,
"grad_norm": 1.7763944864273071,
"learning_rate": 3.746041705134215e-06,
"loss": 0.8628,
"num_input_tokens_seen": 633241600,
"step": 77300
},
{
"epoch": 2.479338842975207,
"grad_norm": 0.8131124973297119,
"learning_rate": 3.7015319541802708e-06,
"loss": 0.8246,
"num_input_tokens_seen": 634060800,
"step": 77400
},
{
"epoch": 2.482542123134089,
"grad_norm": 0.9916465282440186,
"learning_rate": 3.657267064074607e-06,
"loss": 0.806,
"num_input_tokens_seen": 634880000,
"step": 77500
},
{
"epoch": 2.485745403292972,
"grad_norm": 1.6239954233169556,
"learning_rate": 3.613247543714779e-06,
"loss": 0.8068,
"num_input_tokens_seen": 635699200,
"step": 77600
},
{
"epoch": 2.488948683451855,
"grad_norm": 1.0215014219284058,
"learning_rate": 3.5694738991774197e-06,
"loss": 0.7704,
"num_input_tokens_seen": 636518400,
"step": 77700
},
{
"epoch": 2.492151963610737,
"grad_norm": 0.6939218044281006,
"learning_rate": 3.5259466337124293e-06,
"loss": 0.8625,
"num_input_tokens_seen": 637337600,
"step": 77800
},
{
"epoch": 2.49535524376962,
"grad_norm": 0.7442044615745544,
"learning_rate": 3.4826662477371624e-06,
"loss": 0.8093,
"num_input_tokens_seen": 638156800,
"step": 77900
},
{
"epoch": 2.498558523928503,
"grad_norm": 0.5725979208946228,
"learning_rate": 3.4396332388307057e-06,
"loss": 0.8533,
"num_input_tokens_seen": 638976000,
"step": 78000
},
{
"epoch": 2.5017618040873852,
"grad_norm": 2.239358425140381,
"learning_rate": 3.3968481017281173e-06,
"loss": 0.8254,
"num_input_tokens_seen": 639795200,
"step": 78100
},
{
"epoch": 2.504965084246268,
"grad_norm": 0.6777194142341614,
"learning_rate": 3.3543113283147687e-06,
"loss": 0.8311,
"num_input_tokens_seen": 640614400,
"step": 78200
},
{
"epoch": 2.508168364405151,
"grad_norm": 0.9692057371139526,
"learning_rate": 3.3120234076206987e-06,
"loss": 0.8285,
"num_input_tokens_seen": 641433600,
"step": 78300
},
{
"epoch": 2.5113716445640337,
"grad_norm": 0.8157410621643066,
"learning_rate": 3.2699848258149617e-06,
"loss": 0.8276,
"num_input_tokens_seen": 642252800,
"step": 78400
},
{
"epoch": 2.514574924722916,
"grad_norm": 1.9688010215759277,
"learning_rate": 3.228196066200051e-06,
"loss": 0.7989,
"num_input_tokens_seen": 643072000,
"step": 78500
},
{
"epoch": 2.517778204881799,
"grad_norm": 2.142247200012207,
"learning_rate": 3.186657609206353e-06,
"loss": 0.8165,
"num_input_tokens_seen": 643891200,
"step": 78600
},
{
"epoch": 2.5209814850406818,
"grad_norm": 0.7529670596122742,
"learning_rate": 3.1453699323866047e-06,
"loss": 0.8476,
"num_input_tokens_seen": 644710400,
"step": 78700
},
{
"epoch": 2.524184765199564,
"grad_norm": 0.5978514552116394,
"learning_rate": 3.1043335104104233e-06,
"loss": 0.8386,
"num_input_tokens_seen": 645529600,
"step": 78800
},
{
"epoch": 2.527388045358447,
"grad_norm": 0.7615718841552734,
"learning_rate": 3.0635488150588338e-06,
"loss": 0.8198,
"num_input_tokens_seen": 646348800,
"step": 78900
},
{
"epoch": 2.53059132551733,
"grad_norm": 0.7568325400352478,
"learning_rate": 3.0230163152188463e-06,
"loss": 0.8364,
"num_input_tokens_seen": 647168000,
"step": 79000
},
{
"epoch": 2.5337946056762126,
"grad_norm": 0.5773870944976807,
"learning_rate": 2.9827364768780814e-06,
"loss": 0.7922,
"num_input_tokens_seen": 647987200,
"step": 79100
},
{
"epoch": 2.536997885835095,
"grad_norm": 4.734196662902832,
"learning_rate": 2.942709763119386e-06,
"loss": 0.7829,
"num_input_tokens_seen": 648806400,
"step": 79200
},
{
"epoch": 2.540201165993978,
"grad_norm": 0.7763670682907104,
"learning_rate": 2.9029366341155356e-06,
"loss": 0.8196,
"num_input_tokens_seen": 649625600,
"step": 79300
},
{
"epoch": 2.5434044461528607,
"grad_norm": 0.6776308417320251,
"learning_rate": 2.863417547123934e-06,
"loss": 0.788,
"num_input_tokens_seen": 650444800,
"step": 79400
},
{
"epoch": 2.546607726311743,
"grad_norm": 0.7068803906440735,
"learning_rate": 2.8241529564813434e-06,
"loss": 0.8413,
"num_input_tokens_seen": 651264000,
"step": 79500
},
{
"epoch": 2.549811006470626,
"grad_norm": 1.1894068717956543,
"learning_rate": 2.7851433135986843e-06,
"loss": 0.851,
"num_input_tokens_seen": 652083200,
"step": 79600
},
{
"epoch": 2.5530142866295087,
"grad_norm": 1.9698837995529175,
"learning_rate": 2.7463890669558263e-06,
"loss": 0.8379,
"num_input_tokens_seen": 652902400,
"step": 79700
},
{
"epoch": 2.5562175667883915,
"grad_norm": 1.8066941499710083,
"learning_rate": 2.707890662096452e-06,
"loss": 0.7906,
"num_input_tokens_seen": 653721600,
"step": 79800
},
{
"epoch": 2.559420846947274,
"grad_norm": 0.824046790599823,
"learning_rate": 2.6696485416228987e-06,
"loss": 0.8011,
"num_input_tokens_seen": 654540800,
"step": 79900
},
{
"epoch": 2.5626241271061567,
"grad_norm": 0.7096015214920044,
"learning_rate": 2.6316631451911213e-06,
"loss": 0.8328,
"num_input_tokens_seen": 655360000,
"step": 80000
},
{
"epoch": 2.5658274072650396,
"grad_norm": 0.5634686350822449,
"learning_rate": 2.593934909505602e-06,
"loss": 0.8896,
"num_input_tokens_seen": 656179200,
"step": 80100
},
{
"epoch": 2.569030687423922,
"grad_norm": 0.7022582292556763,
"learning_rate": 2.5564642683143263e-06,
"loss": 0.8405,
"num_input_tokens_seen": 656998400,
"step": 80200
},
{
"epoch": 2.5722339675828048,
"grad_norm": 0.010020343586802483,
"learning_rate": 2.51925165240382e-06,
"loss": 0.8639,
"num_input_tokens_seen": 657817600,
"step": 80300
},
{
"epoch": 2.5754372477416876,
"grad_norm": 0.7010151147842407,
"learning_rate": 2.482297489594182e-06,
"loss": 0.813,
"num_input_tokens_seen": 658636800,
"step": 80400
},
{
"epoch": 2.5786405279005704,
"grad_norm": 1.0606889724731445,
"learning_rate": 2.4456022047341653e-06,
"loss": 0.8494,
"num_input_tokens_seen": 659456000,
"step": 80500
},
{
"epoch": 2.581843808059453,
"grad_norm": 0.5736305713653564,
"learning_rate": 2.4091662196963014e-06,
"loss": 0.8748,
"num_input_tokens_seen": 660275200,
"step": 80600
},
{
"epoch": 2.5850470882183356,
"grad_norm": 0.6299107074737549,
"learning_rate": 2.3729899533720485e-06,
"loss": 0.8254,
"num_input_tokens_seen": 661094400,
"step": 80700
},
{
"epoch": 2.588250368377218,
"grad_norm": 0.8091995120048523,
"learning_rate": 2.3370738216669574e-06,
"loss": 0.8373,
"num_input_tokens_seen": 661913600,
"step": 80800
},
{
"epoch": 2.591453648536101,
"grad_norm": 0.7887117862701416,
"learning_rate": 2.3014182374959116e-06,
"loss": 0.7675,
"num_input_tokens_seen": 662732800,
"step": 80900
},
{
"epoch": 2.5946569286949837,
"grad_norm": 0.7341217994689941,
"learning_rate": 2.2660236107783783e-06,
"loss": 0.8264,
"num_input_tokens_seen": 663552000,
"step": 81000
},
{
"epoch": 2.5978602088538665,
"grad_norm": 0.7887162566184998,
"learning_rate": 2.230890348433684e-06,
"loss": 0.8579,
"num_input_tokens_seen": 664371200,
"step": 81100
},
{
"epoch": 2.6010634890127493,
"grad_norm": 0.8627157807350159,
"learning_rate": 2.1960188543763526e-06,
"loss": 0.8412,
"num_input_tokens_seen": 665190400,
"step": 81200
},
{
"epoch": 2.6042667691716317,
"grad_norm": 2.6676676273345947,
"learning_rate": 2.161409529511438e-06,
"loss": 0.7985,
"num_input_tokens_seen": 666009600,
"step": 81300
},
{
"epoch": 2.6074700493305145,
"grad_norm": 0.6035804748535156,
"learning_rate": 2.127062771729929e-06,
"loss": 0.8033,
"num_input_tokens_seen": 666828800,
"step": 81400
},
{
"epoch": 2.610673329489397,
"grad_norm": 2.14854097366333,
"learning_rate": 2.092978975904189e-06,
"loss": 0.8538,
"num_input_tokens_seen": 667648000,
"step": 81500
},
{
"epoch": 2.6138766096482797,
"grad_norm": 1.651636004447937,
"learning_rate": 2.059158533883393e-06,
"loss": 0.8805,
"num_input_tokens_seen": 668467200,
"step": 81600
},
{
"epoch": 2.6170798898071626,
"grad_norm": 2.1014175415039062,
"learning_rate": 2.025601834489038e-06,
"loss": 0.8837,
"num_input_tokens_seen": 669286400,
"step": 81700
},
{
"epoch": 2.6202831699660454,
"grad_norm": 0.741468071937561,
"learning_rate": 1.9923092635104557e-06,
"loss": 0.7892,
"num_input_tokens_seen": 670105600,
"step": 81800
},
{
"epoch": 2.6234864501249278,
"grad_norm": 1.3246105909347534,
"learning_rate": 1.9592812037003918e-06,
"loss": 0.774,
"num_input_tokens_seen": 670924800,
"step": 81900
},
{
"epoch": 2.6266897302838106,
"grad_norm": 0.6697006225585938,
"learning_rate": 1.9265180347706053e-06,
"loss": 0.8393,
"num_input_tokens_seen": 671744000,
"step": 82000
},
{
"epoch": 2.6298930104426934,
"grad_norm": 0.5421914458274841,
"learning_rate": 1.894020133387503e-06,
"loss": 0.8398,
"num_input_tokens_seen": 672563200,
"step": 82100
},
{
"epoch": 2.633096290601576,
"grad_norm": 2.6112563610076904,
"learning_rate": 1.8617878731678e-06,
"loss": 0.8031,
"num_input_tokens_seen": 673382400,
"step": 82200
},
{
"epoch": 2.6362995707604586,
"grad_norm": 0.7507239580154419,
"learning_rate": 1.8298216246742329e-06,
"loss": 0.831,
"num_input_tokens_seen": 674201600,
"step": 82300
},
{
"epoch": 2.6395028509193414,
"grad_norm": 2.156158685684204,
"learning_rate": 1.798121755411289e-06,
"loss": 0.8778,
"num_input_tokens_seen": 675020800,
"step": 82400
},
{
"epoch": 2.6427061310782243,
"grad_norm": 0.5693337917327881,
"learning_rate": 1.7666886298210006e-06,
"loss": 0.7904,
"num_input_tokens_seen": 675840000,
"step": 82500
},
{
"epoch": 2.6459094112371067,
"grad_norm": 0.9597682356834412,
"learning_rate": 1.735522609278742e-06,
"loss": 0.8547,
"num_input_tokens_seen": 676659200,
"step": 82600
},
{
"epoch": 2.6491126913959895,
"grad_norm": 0.8956586122512817,
"learning_rate": 1.7046240520890655e-06,
"loss": 0.8395,
"num_input_tokens_seen": 677478400,
"step": 82700
},
{
"epoch": 2.6523159715548723,
"grad_norm": 0.918878436088562,
"learning_rate": 1.6739933134816117e-06,
"loss": 0.8106,
"num_input_tokens_seen": 678297600,
"step": 82800
},
{
"epoch": 2.6555192517137547,
"grad_norm": 0.6460690498352051,
"learning_rate": 1.6436307456069832e-06,
"loss": 0.8427,
"num_input_tokens_seen": 679116800,
"step": 82900
},
{
"epoch": 2.6587225318726375,
"grad_norm": 0.7876623868942261,
"learning_rate": 1.6135366975327442e-06,
"loss": 0.8306,
"num_input_tokens_seen": 679936000,
"step": 83000
},
{
"epoch": 2.6619258120315203,
"grad_norm": 0.7109478712081909,
"learning_rate": 1.5837115152393695e-06,
"loss": 0.8785,
"num_input_tokens_seen": 680755200,
"step": 83100
},
{
"epoch": 2.665129092190403,
"grad_norm": 0.6864702701568604,
"learning_rate": 1.5541555416162784e-06,
"loss": 0.7719,
"num_input_tokens_seen": 681574400,
"step": 83200
},
{
"epoch": 2.6683323723492856,
"grad_norm": 0.5490867495536804,
"learning_rate": 1.5248691164579054e-06,
"loss": 0.7945,
"num_input_tokens_seen": 682393600,
"step": 83300
},
{
"epoch": 2.6715356525081684,
"grad_norm": 0.7371602654457092,
"learning_rate": 1.4958525764597719e-06,
"loss": 0.8751,
"num_input_tokens_seen": 683212800,
"step": 83400
},
{
"epoch": 2.674738932667051,
"grad_norm": 3.058120012283325,
"learning_rate": 1.4671062552146342e-06,
"loss": 0.807,
"num_input_tokens_seen": 684032000,
"step": 83500
},
{
"epoch": 2.6779422128259336,
"grad_norm": 2.8297903537750244,
"learning_rate": 1.4386304832086333e-06,
"loss": 0.8519,
"num_input_tokens_seen": 684851200,
"step": 83600
},
{
"epoch": 2.6811454929848164,
"grad_norm": 0.5840158462524414,
"learning_rate": 1.4104255878175099e-06,
"loss": 0.7911,
"num_input_tokens_seen": 685670400,
"step": 83700
},
{
"epoch": 2.6843487731436992,
"grad_norm": 0.5358206629753113,
"learning_rate": 1.382491893302837e-06,
"loss": 0.85,
"num_input_tokens_seen": 686489600,
"step": 83800
},
{
"epoch": 2.687552053302582,
"grad_norm": 0.5446909666061401,
"learning_rate": 1.3548297208082678e-06,
"loss": 0.7469,
"num_input_tokens_seen": 687308800,
"step": 83900
},
{
"epoch": 2.6907553334614644,
"grad_norm": 0.7376157641410828,
"learning_rate": 1.3274393883558916e-06,
"loss": 0.815,
"num_input_tokens_seen": 688128000,
"step": 84000
},
{
"epoch": 2.6939586136203473,
"grad_norm": 2.3603358268737793,
"learning_rate": 1.3003212108425256e-06,
"loss": 0.8195,
"num_input_tokens_seen": 688947200,
"step": 84100
},
{
"epoch": 2.6971618937792297,
"grad_norm": 2.3444812297821045,
"learning_rate": 1.2734755000361393e-06,
"loss": 0.8265,
"num_input_tokens_seen": 689766400,
"step": 84200
},
{
"epoch": 2.7003651739381125,
"grad_norm": 0.7536035776138306,
"learning_rate": 1.2469025645722333e-06,
"loss": 0.8382,
"num_input_tokens_seen": 690585600,
"step": 84300
},
{
"epoch": 2.7035684540969953,
"grad_norm": 0.7054631114006042,
"learning_rate": 1.2206027099503275e-06,
"loss": 0.7791,
"num_input_tokens_seen": 691404800,
"step": 84400
},
{
"epoch": 2.706771734255878,
"grad_norm": 0.7819291353225708,
"learning_rate": 1.1945762385304122e-06,
"loss": 0.8321,
"num_input_tokens_seen": 692224000,
"step": 84500
},
{
"epoch": 2.709975014414761,
"grad_norm": 0.7501091361045837,
"learning_rate": 1.168823449529488e-06,
"loss": 0.8494,
"num_input_tokens_seen": 693043200,
"step": 84600
},
{
"epoch": 2.7131782945736433,
"grad_norm": 0.566743016242981,
"learning_rate": 1.1433446390181402e-06,
"loss": 0.8685,
"num_input_tokens_seen": 693862400,
"step": 84700
},
{
"epoch": 2.716381574732526,
"grad_norm": 2.204374313354492,
"learning_rate": 1.1181400999171144e-06,
"loss": 0.8147,
"num_input_tokens_seen": 694681600,
"step": 84800
},
{
"epoch": 2.7195848548914086,
"grad_norm": 2.641223192214966,
"learning_rate": 1.0932101219939594e-06,
"loss": 0.8259,
"num_input_tokens_seen": 695500800,
"step": 84900
},
{
"epoch": 2.7227881350502914,
"grad_norm": 0.747035562992096,
"learning_rate": 1.0685549918596882e-06,
"loss": 0.8737,
"num_input_tokens_seen": 696320000,
"step": 85000
},
{
"epoch": 2.725991415209174,
"grad_norm": 0.9778177738189697,
"learning_rate": 1.0441749929654827e-06,
"loss": 0.8358,
"num_input_tokens_seen": 697139200,
"step": 85100
},
{
"epoch": 2.729194695368057,
"grad_norm": 2.0086069107055664,
"learning_rate": 1.0200704055994548e-06,
"loss": 0.8231,
"num_input_tokens_seen": 697958400,
"step": 85200
},
{
"epoch": 2.73239797552694,
"grad_norm": 0.7290952801704407,
"learning_rate": 9.962415068833968e-07,
"loss": 0.8211,
"num_input_tokens_seen": 698777600,
"step": 85300
},
{
"epoch": 2.7356012556858222,
"grad_norm": 0.6520437598228455,
"learning_rate": 9.726885707696114e-07,
"loss": 0.8776,
"num_input_tokens_seen": 699596800,
"step": 85400
},
{
"epoch": 2.738804535844705,
"grad_norm": 0.5633389353752136,
"learning_rate": 9.494118680377612e-07,
"loss": 0.8198,
"num_input_tokens_seen": 700416000,
"step": 85500
},
{
"epoch": 2.7420078160035875,
"grad_norm": 0.8410841822624207,
"learning_rate": 9.264116662917405e-07,
"loss": 0.8894,
"num_input_tokens_seen": 701235200,
"step": 85600
},
{
"epoch": 2.7452110961624703,
"grad_norm": 2.9148612022399902,
"learning_rate": 9.036882299566229e-07,
"loss": 0.8259,
"num_input_tokens_seen": 702054400,
"step": 85700
},
{
"epoch": 2.748414376321353,
"grad_norm": 0.5637199878692627,
"learning_rate": 8.812418202756107e-07,
"loss": 0.7636,
"num_input_tokens_seen": 702873600,
"step": 85800
},
{
"epoch": 2.751617656480236,
"grad_norm": 0.5929956436157227,
"learning_rate": 8.590726953070228e-07,
"loss": 0.8448,
"num_input_tokens_seen": 703692800,
"step": 85900
},
{
"epoch": 2.7548209366391183,
"grad_norm": 0.5491350889205933,
"learning_rate": 8.371811099213394e-07,
"loss": 0.8467,
"num_input_tokens_seen": 704512000,
"step": 86000
},
{
"epoch": 2.758024216798001,
"grad_norm": 1.0223699808120728,
"learning_rate": 8.155673157982601e-07,
"loss": 0.8133,
"num_input_tokens_seen": 705331200,
"step": 86100
},
{
"epoch": 2.761227496956884,
"grad_norm": 1.5225611925125122,
"learning_rate": 7.942315614238277e-07,
"loss": 0.8109,
"num_input_tokens_seen": 706150400,
"step": 86200
},
{
"epoch": 2.7644307771157663,
"grad_norm": 0.8148054480552673,
"learning_rate": 7.731740920875613e-07,
"loss": 0.821,
"num_input_tokens_seen": 706969600,
"step": 86300
},
{
"epoch": 2.767634057274649,
"grad_norm": 0.7864372730255127,
"learning_rate": 7.523951498796283e-07,
"loss": 0.8135,
"num_input_tokens_seen": 707788800,
"step": 86400
},
{
"epoch": 2.770837337433532,
"grad_norm": 2.5619330406188965,
"learning_rate": 7.318949736880798e-07,
"loss": 0.7905,
"num_input_tokens_seen": 708608000,
"step": 86500
},
{
"epoch": 2.774040617592415,
"grad_norm": 1.5780519247055054,
"learning_rate": 7.116737991960831e-07,
"loss": 0.8608,
"num_input_tokens_seen": 709427200,
"step": 86600
},
{
"epoch": 2.777243897751297,
"grad_norm": 0.666118323802948,
"learning_rate": 6.917318588792299e-07,
"loss": 0.8586,
"num_input_tokens_seen": 710246400,
"step": 86700
},
{
"epoch": 2.78044717791018,
"grad_norm": 0.5050229430198669,
"learning_rate": 6.720693820028629e-07,
"loss": 0.8473,
"num_input_tokens_seen": 711065600,
"step": 86800
},
{
"epoch": 2.783650458069063,
"grad_norm": 0.5586540699005127,
"learning_rate": 6.526865946194172e-07,
"loss": 0.8182,
"num_input_tokens_seen": 711884800,
"step": 86900
},
{
"epoch": 2.7868537382279452,
"grad_norm": 0.6938973665237427,
"learning_rate": 6.335837195658528e-07,
"loss": 0.8493,
"num_input_tokens_seen": 712704000,
"step": 87000
},
{
"epoch": 2.790057018386828,
"grad_norm": 0.8710479736328125,
"learning_rate": 6.147609764610707e-07,
"loss": 0.8134,
"num_input_tokens_seen": 713523200,
"step": 87100
},
{
"epoch": 2.793260298545711,
"grad_norm": 2.5295767784118652,
"learning_rate": 5.962185817034005e-07,
"loss": 0.7893,
"num_input_tokens_seen": 714342400,
"step": 87200
},
{
"epoch": 2.7964635787045937,
"grad_norm": 0.5434448719024658,
"learning_rate": 5.779567484681032e-07,
"loss": 0.7896,
"num_input_tokens_seen": 715161600,
"step": 87300
},
{
"epoch": 2.799666858863476,
"grad_norm": 2.833872079849243,
"learning_rate": 5.599756867049221e-07,
"loss": 0.8185,
"num_input_tokens_seen": 715980800,
"step": 87400
},
{
"epoch": 2.802870139022359,
"grad_norm": 0.5753843784332275,
"learning_rate": 5.422756031356779e-07,
"loss": 0.8188,
"num_input_tokens_seen": 716800000,
"step": 87500
},
{
"epoch": 2.8060734191812418,
"grad_norm": 0.6721400022506714,
"learning_rate": 5.248567012518857e-07,
"loss": 0.8303,
"num_input_tokens_seen": 717619200,
"step": 87600
},
{
"epoch": 2.809276699340124,
"grad_norm": 0.7175859808921814,
"learning_rate": 5.077191813124105e-07,
"loss": 0.7866,
"num_input_tokens_seen": 718438400,
"step": 87700
},
{
"epoch": 2.812479979499007,
"grad_norm": 0.9649165868759155,
"learning_rate": 4.90863240341169e-07,
"loss": 0.8269,
"num_input_tokens_seen": 719257600,
"step": 87800
},
{
"epoch": 2.81568325965789,
"grad_norm": 0.5693693161010742,
"learning_rate": 4.742890721248755e-07,
"loss": 0.7737,
"num_input_tokens_seen": 720076800,
"step": 87900
},
{
"epoch": 2.8188865398167726,
"grad_norm": 0.6442407369613647,
"learning_rate": 4.579968672107943e-07,
"loss": 0.8196,
"num_input_tokens_seen": 720896000,
"step": 88000
},
{
"epoch": 2.822089819975655,
"grad_norm": 0.72199547290802,
"learning_rate": 4.419868129045629e-07,
"loss": 0.7998,
"num_input_tokens_seen": 721715200,
"step": 88100
},
{
"epoch": 2.825293100134538,
"grad_norm": 1.2243154048919678,
"learning_rate": 4.2625909326803325e-07,
"loss": 0.8534,
"num_input_tokens_seen": 722534400,
"step": 88200
},
{
"epoch": 2.82849638029342,
"grad_norm": 0.8224316835403442,
"learning_rate": 4.1081388911715645e-07,
"loss": 0.8262,
"num_input_tokens_seen": 723353600,
"step": 88300
},
{
"epoch": 2.831699660452303,
"grad_norm": 0.7001350522041321,
"learning_rate": 3.9565137801990395e-07,
"loss": 0.8323,
"num_input_tokens_seen": 724172800,
"step": 88400
},
{
"epoch": 2.834902940611186,
"grad_norm": 0.7441889643669128,
"learning_rate": 3.807717342942302e-07,
"loss": 0.8116,
"num_input_tokens_seen": 724992000,
"step": 88500
},
{
"epoch": 2.8381062207700687,
"grad_norm": 0.6325407028198242,
"learning_rate": 3.661751290060633e-07,
"loss": 0.8481,
"num_input_tokens_seen": 725811200,
"step": 88600
},
{
"epoch": 2.8413095009289515,
"grad_norm": 0.9763919711112976,
"learning_rate": 3.5186172996733714e-07,
"loss": 0.8084,
"num_input_tokens_seen": 726630400,
"step": 88700
},
{
"epoch": 2.844512781087834,
"grad_norm": 0.6528813242912292,
"learning_rate": 3.3783170173406764e-07,
"loss": 0.7923,
"num_input_tokens_seen": 727449600,
"step": 88800
},
{
"epoch": 2.8477160612467167,
"grad_norm": 0.8190716505050659,
"learning_rate": 3.2408520560445463e-07,
"loss": 0.8397,
"num_input_tokens_seen": 728268800,
"step": 88900
},
{
"epoch": 2.850919341405599,
"grad_norm": 0.6821821928024292,
"learning_rate": 3.10622399617036e-07,
"loss": 0.7856,
"num_input_tokens_seen": 729088000,
"step": 89000
},
{
"epoch": 2.854122621564482,
"grad_norm": 0.9017992615699768,
"learning_rate": 2.9744343854886393e-07,
"loss": 0.8271,
"num_input_tokens_seen": 729907200,
"step": 89100
},
{
"epoch": 2.8573259017233648,
"grad_norm": 0.6816012263298035,
"learning_rate": 2.8454847391372886e-07,
"loss": 0.8334,
"num_input_tokens_seen": 730726400,
"step": 89200
},
{
"epoch": 2.8605291818822476,
"grad_norm": 1.0822001695632935,
"learning_rate": 2.719376539604107e-07,
"loss": 0.8198,
"num_input_tokens_seen": 731545600,
"step": 89300
},
{
"epoch": 2.86373246204113,
"grad_norm": 0.782041072845459,
"learning_rate": 2.5961112367098306e-07,
"loss": 0.8199,
"num_input_tokens_seen": 732364800,
"step": 89400
},
{
"epoch": 2.866935742200013,
"grad_norm": 1.8875998258590698,
"learning_rate": 2.4756902475914777e-07,
"loss": 0.7963,
"num_input_tokens_seen": 733184000,
"step": 89500
},
{
"epoch": 2.8701390223588956,
"grad_norm": 0.549452543258667,
"learning_rate": 2.358114956685975e-07,
"loss": 0.8353,
"num_input_tokens_seen": 734003200,
"step": 89600
},
{
"epoch": 2.873342302517778,
"grad_norm": 1.3322216272354126,
"learning_rate": 2.243386715714224e-07,
"loss": 0.8547,
"num_input_tokens_seen": 734822400,
"step": 89700
},
{
"epoch": 2.876545582676661,
"grad_norm": 0.8102174997329712,
"learning_rate": 2.1315068436656983e-07,
"loss": 0.8233,
"num_input_tokens_seen": 735641600,
"step": 89800
},
{
"epoch": 2.8797488628355437,
"grad_norm": 0.6969431042671204,
"learning_rate": 2.0224766267831207e-07,
"loss": 0.8622,
"num_input_tokens_seen": 736460800,
"step": 89900
},
{
"epoch": 2.8829521429944265,
"grad_norm": 1.4771400690078735,
"learning_rate": 1.9162973185478383e-07,
"loss": 0.789,
"num_input_tokens_seen": 737280000,
"step": 90000
},
{
"epoch": 2.886155423153309,
"grad_norm": 0.6978898048400879,
"learning_rate": 1.8129701396652487e-07,
"loss": 0.8723,
"num_input_tokens_seen": 738099200,
"step": 90100
},
{
"epoch": 2.8893587033121917,
"grad_norm": 0.838759183883667,
"learning_rate": 1.7124962780508957e-07,
"loss": 0.8136,
"num_input_tokens_seen": 738918400,
"step": 90200
},
{
"epoch": 2.8925619834710745,
"grad_norm": 0.6396787762641907,
"learning_rate": 1.6148768888166744e-07,
"loss": 0.8263,
"num_input_tokens_seen": 739737600,
"step": 90300
},
{
"epoch": 2.895765263629957,
"grad_norm": 0.7068443298339844,
"learning_rate": 1.5201130942577578e-07,
"loss": 0.8388,
"num_input_tokens_seen": 740556800,
"step": 90400
},
{
"epoch": 2.8989685437888397,
"grad_norm": 0.5743166208267212,
"learning_rate": 1.4282059838394701e-07,
"loss": 0.8284,
"num_input_tokens_seen": 741376000,
"step": 90500
},
{
"epoch": 2.9021718239477226,
"grad_norm": 0.5627537369728088,
"learning_rate": 1.3391566141848778e-07,
"loss": 0.834,
"num_input_tokens_seen": 742195200,
"step": 90600
},
{
"epoch": 2.9053751041066054,
"grad_norm": 2.069951057434082,
"learning_rate": 1.2529660090626894e-07,
"loss": 0.8798,
"num_input_tokens_seen": 743014400,
"step": 90700
},
{
"epoch": 2.9085783842654878,
"grad_norm": 0.5723984241485596,
"learning_rate": 1.1696351593753485e-07,
"loss": 0.8443,
"num_input_tokens_seen": 743833600,
"step": 90800
},
{
"epoch": 2.9117816644243706,
"grad_norm": 0.5584101676940918,
"learning_rate": 1.0891650231477646e-07,
"loss": 0.7991,
"num_input_tokens_seen": 744652800,
"step": 90900
},
{
"epoch": 2.9149849445832534,
"grad_norm": 0.8929557800292969,
"learning_rate": 1.0115565255162107e-07,
"loss": 0.8134,
"num_input_tokens_seen": 745472000,
"step": 91000
},
{
"epoch": 2.918188224742136,
"grad_norm": 0.5613967776298523,
"learning_rate": 9.368105587177767e-08,
"loss": 0.855,
"num_input_tokens_seen": 746291200,
"step": 91100
},
{
"epoch": 2.9213915049010186,
"grad_norm": 0.5235220193862915,
"learning_rate": 8.649279820800161e-08,
"loss": 0.7894,
"num_input_tokens_seen": 747110400,
"step": 91200
},
{
"epoch": 2.9245947850599014,
"grad_norm": 2.220933198928833,
"learning_rate": 7.959096220111206e-08,
"loss": 0.8311,
"num_input_tokens_seen": 747929600,
"step": 91300
},
{
"epoch": 2.9277980652187843,
"grad_norm": 2.264698028564453,
"learning_rate": 7.297562719904561e-08,
"loss": 0.7856,
"num_input_tokens_seen": 748748800,
"step": 91400
},
{
"epoch": 2.9310013453776667,
"grad_norm": 0.6808698773384094,
"learning_rate": 6.664686925593188e-08,
"loss": 0.8379,
"num_input_tokens_seen": 749568000,
"step": 91500
},
{
"epoch": 2.9342046255365495,
"grad_norm": 2.1781809329986572,
"learning_rate": 6.060476113123603e-08,
"loss": 0.7529,
"num_input_tokens_seen": 750387200,
"step": 91600
},
{
"epoch": 2.937407905695432,
"grad_norm": 0.6591463685035706,
"learning_rate": 5.4849372288903744e-08,
"loss": 0.8836,
"num_input_tokens_seen": 751206400,
"step": 91700
},
{
"epoch": 2.9406111858543147,
"grad_norm": 0.5385074019432068,
"learning_rate": 4.9380768896578614e-08,
"loss": 0.8253,
"num_input_tokens_seen": 752025600,
"step": 91800
},
{
"epoch": 2.9438144660131975,
"grad_norm": 0.7810553312301636,
"learning_rate": 4.419901382483327e-08,
"loss": 0.7867,
"num_input_tokens_seen": 752844800,
"step": 91900
},
{
"epoch": 2.9470177461720803,
"grad_norm": 1.6066702604293823,
"learning_rate": 3.930416664644498e-08,
"loss": 0.8089,
"num_input_tokens_seen": 753664000,
"step": 92000
},
{
"epoch": 2.950221026330963,
"grad_norm": 0.8969001173973083,
"learning_rate": 3.469628363571564e-08,
"loss": 0.8324,
"num_input_tokens_seen": 754483200,
"step": 92100
},
{
"epoch": 2.9534243064898456,
"grad_norm": 0.6381150484085083,
"learning_rate": 3.037541776782782e-08,
"loss": 0.8199,
"num_input_tokens_seen": 755302400,
"step": 92200
},
{
"epoch": 2.9566275866487284,
"grad_norm": 0.8189881443977356,
"learning_rate": 2.6341618718223048e-08,
"loss": 0.8282,
"num_input_tokens_seen": 756121600,
"step": 92300
},
{
"epoch": 2.9598308668076108,
"grad_norm": 0.744215190410614,
"learning_rate": 2.2594932862041173e-08,
"loss": 0.823,
"num_input_tokens_seen": 756940800,
"step": 92400
},
{
"epoch": 2.9630341469664936,
"grad_norm": 0.6979692578315735,
"learning_rate": 1.91354032735902e-08,
"loss": 0.7854,
"num_input_tokens_seen": 757760000,
"step": 92500
},
{
"epoch": 2.9662374271253764,
"grad_norm": 0.6506592035293579,
"learning_rate": 1.5963069725838385e-08,
"loss": 0.8654,
"num_input_tokens_seen": 758579200,
"step": 92600
},
{
"epoch": 2.9694407072842592,
"grad_norm": 0.7221033573150635,
"learning_rate": 1.3077968689964582e-08,
"loss": 0.7966,
"num_input_tokens_seen": 759398400,
"step": 92700
},
{
"epoch": 2.9726439874431416,
"grad_norm": 0.5663209557533264,
"learning_rate": 1.0480133334947462e-08,
"loss": 0.8375,
"num_input_tokens_seen": 760217600,
"step": 92800
},
{
"epoch": 2.9758472676020244,
"grad_norm": 0.7616459131240845,
"learning_rate": 8.169593527160291e-09,
"loss": 0.8056,
"num_input_tokens_seen": 761036800,
"step": 92900
},
{
"epoch": 2.9790505477609073,
"grad_norm": 0.7259778380393982,
"learning_rate": 6.146375830054507e-09,
"loss": 0.8026,
"num_input_tokens_seen": 761856000,
"step": 93000
},
{
"epoch": 2.9822538279197897,
"grad_norm": 0.6411218643188477,
"learning_rate": 4.410503503840535e-09,
"loss": 0.8472,
"num_input_tokens_seen": 762675200,
"step": 93100
},
{
"epoch": 2.9854571080786725,
"grad_norm": 0.6619647741317749,
"learning_rate": 2.961996505213005e-09,
"loss": 0.8558,
"num_input_tokens_seen": 763494400,
"step": 93200
},
{
"epoch": 2.9886603882375553,
"grad_norm": 0.7283292412757874,
"learning_rate": 1.8008714871453613e-09,
"loss": 0.8321,
"num_input_tokens_seen": 764313600,
"step": 93300
},
{
"epoch": 2.991863668396438,
"grad_norm": 0.7489187717437744,
"learning_rate": 9.271417986705943e-10,
"loss": 0.8264,
"num_input_tokens_seen": 765132800,
"step": 93400
},
{
"epoch": 2.9950669485553205,
"grad_norm": 2.186750888824463,
"learning_rate": 3.408174847480128e-10,
"loss": 0.7796,
"num_input_tokens_seen": 765952000,
"step": 93500
},
{
"epoch": 2.9982702287142033,
"grad_norm": 2.5423426628112793,
"learning_rate": 4.1905286135568434e-11,
"loss": 0.7863,
"num_input_tokens_seen": 766771200,
"step": 93600
},
{
"epoch": 3.0,
"num_input_tokens_seen": 767213568,
"step": 93654,
"total_flos": 3.49334314435121e+19,
"train_loss": 0.04966252789047391,
"train_runtime": 28761.9651,
"train_samples_per_second": 3.256,
"train_steps_per_second": 3.256
}
],
"logging_steps": 100,
"max_steps": 93654,
"num_input_tokens_seen": 767213568,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.49334314435121e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}