TianyiQ's picture
Upload folder using huggingface_hub
59bfb93 verified
raw
history blame
131 kB
{
"best_metric": 2.27061128616333,
"best_model_checkpoint": "./output/training_results/C018_random_sample_llama3-8b-base_pretrain_20240504_182259/checkpoint-800",
"epoch": 4.0,
"eval_steps": 200,
"global_step": 3660,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001092896174863388,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.5038,
"step": 1
},
{
"epoch": 0.00546448087431694,
"grad_norm": 6.018359004510701,
"learning_rate": 1.5e-06,
"loss": 2.4907,
"step": 5
},
{
"epoch": 0.01092896174863388,
"grad_norm": 3.4017007364457332,
"learning_rate": 5.25e-06,
"loss": 2.4315,
"step": 10
},
{
"epoch": 0.01639344262295082,
"grad_norm": 2.6900944944121132,
"learning_rate": 8.25e-06,
"loss": 2.428,
"step": 15
},
{
"epoch": 0.02185792349726776,
"grad_norm": 2.708090744472938,
"learning_rate": 1.2e-05,
"loss": 2.4387,
"step": 20
},
{
"epoch": 0.0273224043715847,
"grad_norm": 3.032951222015636,
"learning_rate": 1.4954883435929662e-05,
"loss": 2.4481,
"step": 25
},
{
"epoch": 0.03278688524590164,
"grad_norm": 2.4292246650027147,
"learning_rate": 1.4731151665173554e-05,
"loss": 2.466,
"step": 30
},
{
"epoch": 0.03825136612021858,
"grad_norm": 2.621759707907051,
"learning_rate": 1.4510477122963378e-05,
"loss": 2.4376,
"step": 35
},
{
"epoch": 0.04371584699453552,
"grad_norm": 2.4177283273408454,
"learning_rate": 1.4292822159268742e-05,
"loss": 2.4828,
"step": 40
},
{
"epoch": 0.04918032786885246,
"grad_norm": 2.8501680697628307,
"learning_rate": 1.4078149536769946e-05,
"loss": 2.4589,
"step": 45
},
{
"epoch": 0.0546448087431694,
"grad_norm": 2.479759465440778,
"learning_rate": 1.386642242689401e-05,
"loss": 2.4807,
"step": 50
},
{
"epoch": 0.060109289617486336,
"grad_norm": 2.8217394805300566,
"learning_rate": 1.3657604405883384e-05,
"loss": 2.4267,
"step": 55
},
{
"epoch": 0.06557377049180328,
"grad_norm": 2.081431814197302,
"learning_rate": 1.3451659450897103e-05,
"loss": 2.4302,
"step": 60
},
{
"epoch": 0.07103825136612021,
"grad_norm": 2.4806889496772477,
"learning_rate": 1.3248551936144194e-05,
"loss": 2.4394,
"step": 65
},
{
"epoch": 0.07650273224043716,
"grad_norm": 2.440876359920224,
"learning_rate": 1.3048246629049058e-05,
"loss": 2.4125,
"step": 70
},
{
"epoch": 0.08196721311475409,
"grad_norm": 2.0932998526603517,
"learning_rate": 1.2889996545293194e-05,
"loss": 2.4529,
"step": 75
},
{
"epoch": 0.08743169398907104,
"grad_norm": 2.1065047597686135,
"learning_rate": 1.2694647666488102e-05,
"loss": 2.4064,
"step": 80
},
{
"epoch": 0.09289617486338798,
"grad_norm": 2.547164860929408,
"learning_rate": 1.2502004403786172e-05,
"loss": 2.4265,
"step": 85
},
{
"epoch": 0.09836065573770492,
"grad_norm": 2.1438020912476348,
"learning_rate": 1.2312032984416495e-05,
"loss": 2.4098,
"step": 90
},
{
"epoch": 0.10382513661202186,
"grad_norm": 2.068160218649763,
"learning_rate": 1.212470001085604e-05,
"loss": 2.4031,
"step": 95
},
{
"epoch": 0.1092896174863388,
"grad_norm": 1.9843644588123759,
"learning_rate": 1.1939972457176422e-05,
"loss": 2.405,
"step": 100
},
{
"epoch": 0.11475409836065574,
"grad_norm": 2.1172473045661224,
"learning_rate": 1.175781766542116e-05,
"loss": 2.3911,
"step": 105
},
{
"epoch": 0.12021857923497267,
"grad_norm": 2.3445926712239196,
"learning_rate": 1.1613924507166693e-05,
"loss": 2.3869,
"step": 110
},
{
"epoch": 0.12568306010928962,
"grad_norm": 2.025043424580075,
"learning_rate": 1.1436319549649206e-05,
"loss": 2.391,
"step": 115
},
{
"epoch": 0.13114754098360656,
"grad_norm": 1.9766925379731093,
"learning_rate": 1.1261197838924792e-05,
"loss": 2.4065,
"step": 120
},
{
"epoch": 0.1366120218579235,
"grad_norm": 2.0255472574517768,
"learning_rate": 1.1088528082822625e-05,
"loss": 2.4301,
"step": 125
},
{
"epoch": 0.14207650273224043,
"grad_norm": 2.2637369635110933,
"learning_rate": 1.0918279340172864e-05,
"loss": 2.3594,
"step": 130
},
{
"epoch": 0.14754098360655737,
"grad_norm": 2.053999551417616,
"learning_rate": 1.0750421017356817e-05,
"loss": 2.3738,
"step": 135
},
{
"epoch": 0.15300546448087432,
"grad_norm": 2.305037126151136,
"learning_rate": 1.0584922864886185e-05,
"loss": 2.3832,
"step": 140
},
{
"epoch": 0.15846994535519127,
"grad_norm": 2.1180967831426787,
"learning_rate": 1.0421754974011241e-05,
"loss": 2.413,
"step": 145
},
{
"epoch": 0.16393442622950818,
"grad_norm": 2.1528001939587402,
"learning_rate": 1.026088777335768e-05,
"loss": 2.3649,
"step": 150
},
{
"epoch": 0.16939890710382513,
"grad_norm": 1.9416004094979256,
"learning_rate": 1.0102292025591967e-05,
"loss": 2.3733,
"step": 155
},
{
"epoch": 0.17486338797814208,
"grad_norm": 2.2225363194253847,
"learning_rate": 9.945938824114975e-06,
"loss": 2.385,
"step": 160
},
{
"epoch": 0.18032786885245902,
"grad_norm": 2.492677416034468,
"learning_rate": 9.791799589783724e-06,
"loss": 2.3586,
"step": 165
},
{
"epoch": 0.18579234972677597,
"grad_norm": 1.9241743990594526,
"learning_rate": 9.639846067661005e-06,
"loss": 2.3548,
"step": 170
},
{
"epoch": 0.1912568306010929,
"grad_norm": 1.9740476668210596,
"learning_rate": 9.490050323792687e-06,
"loss": 2.3768,
"step": 175
},
{
"epoch": 0.19672131147540983,
"grad_norm": 2.0445759366709106,
"learning_rate": 9.342384742012546e-06,
"loss": 2.4061,
"step": 180
},
{
"epoch": 0.20218579234972678,
"grad_norm": 1.9639271133424887,
"learning_rate": 9.19682202077437e-06,
"loss": 2.3726,
"step": 185
},
{
"epoch": 0.20765027322404372,
"grad_norm": 1.9663469004265115,
"learning_rate": 9.053335170011187e-06,
"loss": 2.3515,
"step": 190
},
{
"epoch": 0.21311475409836064,
"grad_norm": 1.9350297245856483,
"learning_rate": 8.911897508021392e-06,
"loss": 2.3408,
"step": 195
},
{
"epoch": 0.2185792349726776,
"grad_norm": 1.8993527753581194,
"learning_rate": 8.77248265838164e-06,
"loss": 2.3701,
"step": 200
},
{
"epoch": 0.2185792349726776,
"eval_loss": 2.3701858520507812,
"eval_runtime": 75.1482,
"eval_samples_per_second": 86.589,
"eval_steps_per_second": 0.679,
"step": 200
},
{
"epoch": 0.22404371584699453,
"grad_norm": 2.203924855542508,
"learning_rate": 8.635064546886168e-06,
"loss": 2.3966,
"step": 205
},
{
"epoch": 0.22950819672131148,
"grad_norm": 2.114129121333607,
"learning_rate": 8.499617398512568e-06,
"loss": 2.3397,
"step": 210
},
{
"epoch": 0.23497267759562843,
"grad_norm": 1.924587792624844,
"learning_rate": 8.366115734413646e-06,
"loss": 2.3665,
"step": 215
},
{
"epoch": 0.24043715846994534,
"grad_norm": 1.8589409579909668,
"learning_rate": 8.234534368935251e-06,
"loss": 2.3603,
"step": 220
},
{
"epoch": 0.2459016393442623,
"grad_norm": 2.045586546826662,
"learning_rate": 8.104848406659907e-06,
"loss": 2.3569,
"step": 225
},
{
"epoch": 0.25136612021857924,
"grad_norm": 1.9470161431434365,
"learning_rate": 7.97703323947598e-06,
"loss": 2.322,
"step": 230
},
{
"epoch": 0.2568306010928962,
"grad_norm": 1.8292713581809432,
"learning_rate": 7.85106454367231e-06,
"loss": 2.3516,
"step": 235
},
{
"epoch": 0.26229508196721313,
"grad_norm": 1.8494090625326762,
"learning_rate": 7.72691827705802e-06,
"loss": 2.375,
"step": 240
},
{
"epoch": 0.2677595628415301,
"grad_norm": 2.161027732489493,
"learning_rate": 7.604570676107382e-06,
"loss": 2.3498,
"step": 245
},
{
"epoch": 0.273224043715847,
"grad_norm": 2.1707598899295357,
"learning_rate": 7.483998253129525e-06,
"loss": 2.3503,
"step": 250
},
{
"epoch": 0.2786885245901639,
"grad_norm": 1.8564170426077466,
"learning_rate": 7.365177793462842e-06,
"loss": 2.3285,
"step": 255
},
{
"epoch": 0.28415300546448086,
"grad_norm": 1.9463620290299803,
"learning_rate": 7.248086352693862e-06,
"loss": 2.3287,
"step": 260
},
{
"epoch": 0.2896174863387978,
"grad_norm": 2.0448861366457924,
"learning_rate": 7.132701253900465e-06,
"loss": 2.3307,
"step": 265
},
{
"epoch": 0.29508196721311475,
"grad_norm": 2.0011605708324685,
"learning_rate": 7.019000084919226e-06,
"loss": 2.3445,
"step": 270
},
{
"epoch": 0.3005464480874317,
"grad_norm": 1.7859540910895997,
"learning_rate": 6.906960695636718e-06,
"loss": 2.3176,
"step": 275
},
{
"epoch": 0.30601092896174864,
"grad_norm": 2.0882334857896554,
"learning_rate": 6.796561195304612e-06,
"loss": 2.3152,
"step": 280
},
{
"epoch": 0.3114754098360656,
"grad_norm": 1.8550717237355474,
"learning_rate": 6.687779949878386e-06,
"loss": 2.3072,
"step": 285
},
{
"epoch": 0.31693989071038253,
"grad_norm": 1.9231867190142091,
"learning_rate": 6.580595579379473e-06,
"loss": 2.3527,
"step": 290
},
{
"epoch": 0.3224043715846995,
"grad_norm": 2.2210554738167056,
"learning_rate": 6.474986955280685e-06,
"loss": 2.3422,
"step": 295
},
{
"epoch": 0.32786885245901637,
"grad_norm": 2.0430624582463506,
"learning_rate": 6.370933197914722e-06,
"loss": 2.3153,
"step": 300
},
{
"epoch": 0.3333333333333333,
"grad_norm": 2.0678711431760104,
"learning_rate": 6.268413673905618e-06,
"loss": 2.3097,
"step": 305
},
{
"epoch": 0.33879781420765026,
"grad_norm": 1.8084581993894073,
"learning_rate": 6.167407993622935e-06,
"loss": 2.3256,
"step": 310
},
{
"epoch": 0.3442622950819672,
"grad_norm": 1.9468221957558098,
"learning_rate": 6.067896008658554e-06,
"loss": 2.3447,
"step": 315
},
{
"epoch": 0.34972677595628415,
"grad_norm": 1.995128197802868,
"learning_rate": 5.9698578093258756e-06,
"loss": 2.3063,
"step": 320
},
{
"epoch": 0.3551912568306011,
"grad_norm": 1.9717788395704754,
"learning_rate": 5.873273722181316e-06,
"loss": 2.3468,
"step": 325
},
{
"epoch": 0.36065573770491804,
"grad_norm": 1.7044787526539047,
"learning_rate": 5.778124307567816e-06,
"loss": 2.3458,
"step": 330
},
{
"epoch": 0.366120218579235,
"grad_norm": 2.0310677469626994,
"learning_rate": 5.68439035718035e-06,
"loss": 2.3099,
"step": 335
},
{
"epoch": 0.37158469945355194,
"grad_norm": 1.8797720229376973,
"learning_rate": 5.592052891653163e-06,
"loss": 2.3293,
"step": 340
},
{
"epoch": 0.3770491803278688,
"grad_norm": 1.7420600285844794,
"learning_rate": 5.5010931581686135e-06,
"loss": 2.3347,
"step": 345
},
{
"epoch": 0.3825136612021858,
"grad_norm": 1.923012658321935,
"learning_rate": 5.411492628087456e-06,
"loss": 2.2903,
"step": 350
},
{
"epoch": 0.3879781420765027,
"grad_norm": 1.8521170693883549,
"learning_rate": 5.3232329946004e-06,
"loss": 2.3296,
"step": 355
},
{
"epoch": 0.39344262295081966,
"grad_norm": 1.8510472078686617,
"learning_rate": 5.2362961704007885e-06,
"loss": 2.3372,
"step": 360
},
{
"epoch": 0.3989071038251366,
"grad_norm": 2.0309505396989302,
"learning_rate": 5.150664285378238e-06,
"loss": 2.2872,
"step": 365
},
{
"epoch": 0.40437158469945356,
"grad_norm": 1.8697259417175387,
"learning_rate": 5.06631968433308e-06,
"loss": 2.3182,
"step": 370
},
{
"epoch": 0.4098360655737705,
"grad_norm": 1.829952724705,
"learning_rate": 4.9832449247114525e-06,
"loss": 2.2973,
"step": 375
},
{
"epoch": 0.41530054644808745,
"grad_norm": 1.7902123068449143,
"learning_rate": 4.901422774360872e-06,
"loss": 2.3068,
"step": 380
},
{
"epoch": 0.4207650273224044,
"grad_norm": 1.7833743483161062,
"learning_rate": 4.8208362093061525e-06,
"loss": 2.2842,
"step": 385
},
{
"epoch": 0.4262295081967213,
"grad_norm": 1.9963838229648958,
"learning_rate": 4.741468411545501e-06,
"loss": 2.2788,
"step": 390
},
{
"epoch": 0.43169398907103823,
"grad_norm": 2.165728407748183,
"learning_rate": 4.6633027668666485e-06,
"loss": 2.2629,
"step": 395
},
{
"epoch": 0.4371584699453552,
"grad_norm": 1.8956899938550533,
"learning_rate": 4.58632286268284e-06,
"loss": 2.3183,
"step": 400
},
{
"epoch": 0.4371584699453552,
"eval_loss": 2.316016674041748,
"eval_runtime": 75.0612,
"eval_samples_per_second": 86.689,
"eval_steps_per_second": 0.679,
"step": 400
},
{
"epoch": 0.4426229508196721,
"grad_norm": 1.9009361654227148,
"learning_rate": 4.510512485888576e-06,
"loss": 2.3128,
"step": 405
},
{
"epoch": 0.44808743169398907,
"grad_norm": 1.8723633542124947,
"learning_rate": 4.435855620734914e-06,
"loss": 2.2849,
"step": 410
},
{
"epoch": 0.453551912568306,
"grad_norm": 1.739990729696985,
"learning_rate": 4.3623364467242e-06,
"loss": 2.323,
"step": 415
},
{
"epoch": 0.45901639344262296,
"grad_norm": 2.739418331731018,
"learning_rate": 4.289939336524074e-06,
"loss": 2.285,
"step": 420
},
{
"epoch": 0.4644808743169399,
"grad_norm": 1.744787908955572,
"learning_rate": 4.218648853900638e-06,
"loss": 2.3438,
"step": 425
},
{
"epoch": 0.46994535519125685,
"grad_norm": 2.108125431007958,
"learning_rate": 4.148449751670545e-06,
"loss": 2.2864,
"step": 430
},
{
"epoch": 0.47540983606557374,
"grad_norm": 1.7777684575501653,
"learning_rate": 4.0793269696719935e-06,
"loss": 2.2953,
"step": 435
},
{
"epoch": 0.4808743169398907,
"grad_norm": 1.9646082069769346,
"learning_rate": 4.011265632754383e-06,
"loss": 2.3371,
"step": 440
},
{
"epoch": 0.48633879781420764,
"grad_norm": 1.9015798878951815,
"learning_rate": 3.944251048786522e-06,
"loss": 2.2647,
"step": 445
},
{
"epoch": 0.4918032786885246,
"grad_norm": 1.946335104230161,
"learning_rate": 3.878268706683258e-06,
"loss": 2.2622,
"step": 450
},
{
"epoch": 0.4972677595628415,
"grad_norm": 1.7407268985234177,
"learning_rate": 3.8133042744503556e-06,
"loss": 2.2978,
"step": 455
},
{
"epoch": 0.5027322404371585,
"grad_norm": 2.0354514658677867,
"learning_rate": 3.7493435972475156e-06,
"loss": 2.3088,
"step": 460
},
{
"epoch": 0.5081967213114754,
"grad_norm": 1.8511439849509024,
"learning_rate": 3.686372695469369e-06,
"loss": 2.3243,
"step": 465
},
{
"epoch": 0.5136612021857924,
"grad_norm": 1.8563805349568043,
"learning_rate": 3.6243777628443207e-06,
"loss": 2.3126,
"step": 470
},
{
"epoch": 0.5191256830601093,
"grad_norm": 1.8151947657786947,
"learning_rate": 3.5633451645510976e-06,
"loss": 2.3406,
"step": 475
},
{
"epoch": 0.5245901639344263,
"grad_norm": 1.8105166181198042,
"learning_rate": 3.5032614353528692e-06,
"loss": 2.3148,
"step": 480
},
{
"epoch": 0.5300546448087432,
"grad_norm": 1.8770594853800158,
"learning_rate": 3.4441132777487983e-06,
"loss": 2.2673,
"step": 485
},
{
"epoch": 0.5355191256830601,
"grad_norm": 1.8047846783432062,
"learning_rate": 3.385887560142889e-06,
"loss": 2.2999,
"step": 490
},
{
"epoch": 0.5409836065573771,
"grad_norm": 1.8251621882251348,
"learning_rate": 3.3285713150299956e-06,
"loss": 2.2806,
"step": 495
},
{
"epoch": 0.546448087431694,
"grad_norm": 1.871841118264576,
"learning_rate": 3.27215173719886e-06,
"loss": 2.2755,
"step": 500
},
{
"epoch": 0.5519125683060109,
"grad_norm": 1.7071197085633982,
"learning_rate": 3.216616181952041e-06,
"loss": 2.3256,
"step": 505
},
{
"epoch": 0.5573770491803278,
"grad_norm": 1.8202076520109158,
"learning_rate": 3.161952163342607e-06,
"loss": 2.2326,
"step": 510
},
{
"epoch": 0.5628415300546448,
"grad_norm": 2.101153989436264,
"learning_rate": 3.1081473524274575e-06,
"loss": 2.2992,
"step": 515
},
{
"epoch": 0.5683060109289617,
"grad_norm": 2.8542639158592804,
"learning_rate": 3.0551895755371417e-06,
"loss": 2.2662,
"step": 520
},
{
"epoch": 0.5737704918032787,
"grad_norm": 1.9673021719695818,
"learning_rate": 3.00306681256205e-06,
"loss": 2.3003,
"step": 525
},
{
"epoch": 0.5792349726775956,
"grad_norm": 2.010083775622308,
"learning_rate": 2.9517671952548357e-06,
"loss": 2.3146,
"step": 530
},
{
"epoch": 0.5846994535519126,
"grad_norm": 1.8194888309578177,
"learning_rate": 2.9012790055489625e-06,
"loss": 2.2817,
"step": 535
},
{
"epoch": 0.5901639344262295,
"grad_norm": 3.4489573911262608,
"learning_rate": 2.8515906738932173e-06,
"loss": 2.2923,
"step": 540
},
{
"epoch": 0.5956284153005464,
"grad_norm": 1.7960363620990365,
"learning_rate": 2.8026907776020966e-06,
"loss": 2.2396,
"step": 545
},
{
"epoch": 0.6010928961748634,
"grad_norm": 1.9801171472834103,
"learning_rate": 2.7545680392219096e-06,
"loss": 2.2668,
"step": 550
},
{
"epoch": 0.6065573770491803,
"grad_norm": 1.7618650836948095,
"learning_rate": 2.7072113249124913e-06,
"loss": 2.2449,
"step": 555
},
{
"epoch": 0.6120218579234973,
"grad_norm": 1.730914258843425,
"learning_rate": 2.660609642844413e-06,
"loss": 2.2918,
"step": 560
},
{
"epoch": 0.6174863387978142,
"grad_norm": 1.7646145040322634,
"learning_rate": 2.6147521416115106e-06,
"loss": 2.2862,
"step": 565
},
{
"epoch": 0.6229508196721312,
"grad_norm": 1.9128717333080465,
"learning_rate": 2.5696281086586865e-06,
"loss": 2.2657,
"step": 570
},
{
"epoch": 0.6284153005464481,
"grad_norm": 1.956303743152218,
"learning_rate": 2.5252269687248056e-06,
"loss": 2.3029,
"step": 575
},
{
"epoch": 0.6338797814207651,
"grad_norm": 1.7226771008230806,
"learning_rate": 2.4815382823005854e-06,
"loss": 2.2454,
"step": 580
},
{
"epoch": 0.639344262295082,
"grad_norm": 1.8222023652295238,
"learning_rate": 2.4385517441013565e-06,
"loss": 2.3003,
"step": 585
},
{
"epoch": 0.644808743169399,
"grad_norm": 1.8097760740890172,
"learning_rate": 2.3962571815545747e-06,
"loss": 2.3239,
"step": 590
},
{
"epoch": 0.6502732240437158,
"grad_norm": 1.7469489003388072,
"learning_rate": 2.3546445533019647e-06,
"loss": 2.289,
"step": 595
},
{
"epoch": 0.6557377049180327,
"grad_norm": 1.8488648855172372,
"learning_rate": 2.31370394771618e-06,
"loss": 2.2634,
"step": 600
},
{
"epoch": 0.6557377049180327,
"eval_loss": 2.2862629890441895,
"eval_runtime": 75.0848,
"eval_samples_per_second": 86.662,
"eval_steps_per_second": 0.679,
"step": 600
},
{
"epoch": 0.6612021857923497,
"grad_norm": 1.775242640587005,
"learning_rate": 2.2734255814318526e-06,
"loss": 2.2729,
"step": 605
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.7202766576440889,
"learning_rate": 2.233799797890934e-06,
"loss": 2.2784,
"step": 610
},
{
"epoch": 0.6721311475409836,
"grad_norm": 1.808717147716665,
"learning_rate": 2.1948170659021868e-06,
"loss": 2.2501,
"step": 615
},
{
"epoch": 0.6775956284153005,
"grad_norm": 1.7433893633830992,
"learning_rate": 2.1564679782147374e-06,
"loss": 2.2937,
"step": 620
},
{
"epoch": 0.6830601092896175,
"grad_norm": 1.7254327868818564,
"learning_rate": 2.1187432501055544e-06,
"loss": 2.3049,
"step": 625
},
{
"epoch": 0.6885245901639344,
"grad_norm": 1.7514572806831676,
"learning_rate": 2.0816337179807527e-06,
"loss": 2.2563,
"step": 630
},
{
"epoch": 0.6939890710382514,
"grad_norm": 2.517555263627969,
"learning_rate": 2.0451303379906046e-06,
"loss": 2.2915,
"step": 635
},
{
"epoch": 0.6994535519125683,
"grad_norm": 1.841614406691522,
"learning_rate": 2.0092241846581427e-06,
"loss": 2.2846,
"step": 640
},
{
"epoch": 0.7049180327868853,
"grad_norm": 2.117519428379047,
"learning_rate": 1.973906449521264e-06,
"loss": 2.2822,
"step": 645
},
{
"epoch": 0.7103825136612022,
"grad_norm": 1.775749159266723,
"learning_rate": 1.9391684397881756e-06,
"loss": 2.2472,
"step": 650
},
{
"epoch": 0.7158469945355191,
"grad_norm": 1.7229676762831452,
"learning_rate": 1.9050015770061387e-06,
"loss": 2.2924,
"step": 655
},
{
"epoch": 0.7213114754098361,
"grad_norm": 1.7135336336426077,
"learning_rate": 1.8713973957433444e-06,
"loss": 2.2932,
"step": 660
},
{
"epoch": 0.726775956284153,
"grad_norm": 1.746651653634065,
"learning_rate": 1.838347542283849e-06,
"loss": 2.2625,
"step": 665
},
{
"epoch": 0.73224043715847,
"grad_norm": 1.8692885279821523,
"learning_rate": 1.8058437733354382e-06,
"loss": 2.2856,
"step": 670
},
{
"epoch": 0.7377049180327869,
"grad_norm": 1.8208277427071937,
"learning_rate": 1.773877954750328e-06,
"loss": 2.2477,
"step": 675
},
{
"epoch": 0.7431693989071039,
"grad_norm": 1.6812537080705303,
"learning_rate": 1.7424420602585894e-06,
"loss": 2.3132,
"step": 680
},
{
"epoch": 0.7486338797814208,
"grad_norm": 2.0801892129831256,
"learning_rate": 1.7115281702141926e-06,
"loss": 2.2575,
"step": 685
},
{
"epoch": 0.7540983606557377,
"grad_norm": 1.8414116320368654,
"learning_rate": 1.6811284703535634e-06,
"loss": 2.2476,
"step": 690
},
{
"epoch": 0.7595628415300546,
"grad_norm": 2.0566662495103483,
"learning_rate": 1.651235250566554e-06,
"loss": 2.2569,
"step": 695
},
{
"epoch": 0.7650273224043715,
"grad_norm": 2.016755831922365,
"learning_rate": 1.6218409036797155e-06,
"loss": 2.2568,
"step": 700
},
{
"epoch": 0.7704918032786885,
"grad_norm": 1.7180035270775444,
"learning_rate": 1.592937924251778e-06,
"loss": 2.2993,
"step": 705
},
{
"epoch": 0.7759562841530054,
"grad_norm": 1.7480243896979724,
"learning_rate": 1.5645189073812295e-06,
"loss": 2.2602,
"step": 710
},
{
"epoch": 0.7814207650273224,
"grad_norm": 1.7826578176545964,
"learning_rate": 1.5365765475258971e-06,
"loss": 2.2554,
"step": 715
},
{
"epoch": 0.7868852459016393,
"grad_norm": 1.7135083462521725,
"learning_rate": 1.5091036373344258e-06,
"loss": 2.2941,
"step": 720
},
{
"epoch": 0.7923497267759563,
"grad_norm": 1.829935840802774,
"learning_rate": 1.4820930664895563e-06,
"loss": 2.2986,
"step": 725
},
{
"epoch": 0.7978142076502732,
"grad_norm": 1.731928027216758,
"learning_rate": 1.455537820563104e-06,
"loss": 2.249,
"step": 730
},
{
"epoch": 0.8032786885245902,
"grad_norm": 1.7873474828840332,
"learning_rate": 1.4294309798825372e-06,
"loss": 2.2462,
"step": 735
},
{
"epoch": 0.8087431693989071,
"grad_norm": 1.759568248731093,
"learning_rate": 1.4037657184090597e-06,
"loss": 2.2722,
"step": 740
},
{
"epoch": 0.8142076502732241,
"grad_norm": 1.7115339550033273,
"learning_rate": 1.3785353026270964e-06,
"loss": 2.2739,
"step": 745
},
{
"epoch": 0.819672131147541,
"grad_norm": 1.6802608563862464,
"learning_rate": 1.3537330904450898e-06,
"loss": 2.2312,
"step": 750
},
{
"epoch": 0.825136612021858,
"grad_norm": 1.7819162358568228,
"learning_rate": 1.3293525301075076e-06,
"loss": 2.2691,
"step": 755
},
{
"epoch": 0.8306010928961749,
"grad_norm": 1.7268343293878012,
"learning_rate": 1.305387159117968e-06,
"loss": 2.3017,
"step": 760
},
{
"epoch": 0.8360655737704918,
"grad_norm": 1.9444235134875572,
"learning_rate": 1.2818306031733856e-06,
"loss": 2.2924,
"step": 765
},
{
"epoch": 0.8415300546448088,
"grad_norm": 1.7510208764482034,
"learning_rate": 1.258676575109047e-06,
"loss": 2.2897,
"step": 770
},
{
"epoch": 0.8469945355191257,
"grad_norm": 2.517375736052748,
"learning_rate": 1.2359188738545197e-06,
"loss": 2.2454,
"step": 775
},
{
"epoch": 0.8524590163934426,
"grad_norm": 1.6966653908275375,
"learning_rate": 1.2135513834003019e-06,
"loss": 2.2569,
"step": 780
},
{
"epoch": 0.8579234972677595,
"grad_norm": 1.91572460682662,
"learning_rate": 1.1915680717751282e-06,
"loss": 2.2454,
"step": 785
},
{
"epoch": 0.8633879781420765,
"grad_norm": 1.7753619527615636,
"learning_rate": 1.1699629900338182e-06,
"loss": 2.271,
"step": 790
},
{
"epoch": 0.8688524590163934,
"grad_norm": 1.7960345912349553,
"learning_rate": 1.1487302712556065e-06,
"loss": 2.2328,
"step": 795
},
{
"epoch": 0.8743169398907104,
"grad_norm": 1.8256697032153515,
"learning_rate": 1.1278641295528428e-06,
"loss": 2.2522,
"step": 800
},
{
"epoch": 0.8743169398907104,
"eval_loss": 2.27061128616333,
"eval_runtime": 75.1121,
"eval_samples_per_second": 86.631,
"eval_steps_per_second": 0.679,
"step": 800
},
{
"epoch": 0.8797814207650273,
"grad_norm": 1.755415386789429,
"learning_rate": 1.1073588590899781e-06,
"loss": 2.2794,
"step": 805
},
{
"epoch": 0.8852459016393442,
"grad_norm": 1.82410498387524,
"learning_rate": 1.087208833112751e-06,
"loss": 2.285,
"step": 810
},
{
"epoch": 0.8907103825136612,
"grad_norm": 1.7491199970554299,
"learning_rate": 1.0674085029874798e-06,
"loss": 2.2838,
"step": 815
},
{
"epoch": 0.8961748633879781,
"grad_norm": 1.7620440534843038,
"learning_rate": 1.0479523972503778e-06,
"loss": 2.2571,
"step": 820
},
{
"epoch": 0.9016393442622951,
"grad_norm": 1.7623871723124545,
"learning_rate": 1.0288351206668029e-06,
"loss": 2.2152,
"step": 825
},
{
"epoch": 0.907103825136612,
"grad_norm": 1.7314270943428405,
"learning_rate": 1.0100513533003527e-06,
"loss": 2.2728,
"step": 830
},
{
"epoch": 0.912568306010929,
"grad_norm": 1.7654507075500774,
"learning_rate": 9.915958495917222e-07,
"loss": 2.247,
"step": 835
},
{
"epoch": 0.9180327868852459,
"grad_norm": 1.678925667557596,
"learning_rate": 9.734634374472352e-07,
"loss": 2.2616,
"step": 840
},
{
"epoch": 0.9234972677595629,
"grad_norm": 1.7684124052868442,
"learning_rate": 9.556490173369703e-07,
"loss": 2.2862,
"step": 845
},
{
"epoch": 0.9289617486338798,
"grad_norm": 3.0732219671025516,
"learning_rate": 9.381475614023894e-07,
"loss": 2.2431,
"step": 850
},
{
"epoch": 0.9344262295081968,
"grad_norm": 1.7623273850966537,
"learning_rate": 9.209541125733917e-07,
"loss": 2.2347,
"step": 855
},
{
"epoch": 0.9398907103825137,
"grad_norm": 1.7652819800497817,
"learning_rate": 9.040637836947072e-07,
"loss": 2.2397,
"step": 860
},
{
"epoch": 0.9453551912568307,
"grad_norm": 2.309656697777325,
"learning_rate": 8.874717566615452e-07,
"loss": 2.2653,
"step": 865
},
{
"epoch": 0.9508196721311475,
"grad_norm": 1.7843612739885204,
"learning_rate": 8.711732815644269e-07,
"loss": 2.2434,
"step": 870
},
{
"epoch": 0.9562841530054644,
"grad_norm": 1.7109714272710808,
"learning_rate": 8.551636758430965e-07,
"loss": 2.2745,
"step": 875
},
{
"epoch": 0.9617486338797814,
"grad_norm": 1.7579744867576292,
"learning_rate": 8.394383234494619e-07,
"loss": 2.2248,
"step": 880
},
{
"epoch": 0.9672131147540983,
"grad_norm": 1.849160968341628,
"learning_rate": 8.239926740194595e-07,
"loss": 2.251,
"step": 885
},
{
"epoch": 0.9726775956284153,
"grad_norm": 1.861581611214087,
"learning_rate": 8.088222420537758e-07,
"loss": 2.2483,
"step": 890
},
{
"epoch": 0.9781420765027322,
"grad_norm": 1.7754689870317883,
"learning_rate": 7.939226061073428e-07,
"loss": 2.2332,
"step": 895
},
{
"epoch": 0.9836065573770492,
"grad_norm": 1.7804277680548917,
"learning_rate": 7.792894079875298e-07,
"loss": 2.236,
"step": 900
},
{
"epoch": 0.9890710382513661,
"grad_norm": 1.8706013663334191,
"learning_rate": 7.649183519609543e-07,
"loss": 2.2355,
"step": 905
},
{
"epoch": 0.994535519125683,
"grad_norm": 2.1654744337173804,
"learning_rate": 7.508052039688325e-07,
"loss": 2.2716,
"step": 910
},
{
"epoch": 1.0,
"grad_norm": 1.8923905206181715,
"learning_rate": 7.369457908507959e-07,
"loss": 2.2432,
"step": 915
},
{
"epoch": 1.005464480874317,
"grad_norm": 1.8669783431369535,
"learning_rate": 7.233359995770941e-07,
"loss": 2.0815,
"step": 920
},
{
"epoch": 1.010928961748634,
"grad_norm": 1.8664223098231116,
"learning_rate": 7.09971776489111e-07,
"loss": 2.1179,
"step": 925
},
{
"epoch": 1.0163934426229508,
"grad_norm": 2.0035276022188526,
"learning_rate": 6.968491265481181e-07,
"loss": 2.0239,
"step": 930
},
{
"epoch": 1.0218579234972678,
"grad_norm": 1.863776527509761,
"learning_rate": 6.839641125921904e-07,
"loss": 2.0409,
"step": 935
},
{
"epoch": 1.0273224043715847,
"grad_norm": 1.8550407448868003,
"learning_rate": 6.713128546012103e-07,
"loss": 2.0766,
"step": 940
},
{
"epoch": 1.0327868852459017,
"grad_norm": 1.836653625153128,
"learning_rate": 6.588915289698876e-07,
"loss": 2.0376,
"step": 945
},
{
"epoch": 1.0382513661202186,
"grad_norm": 1.8030661735223916,
"learning_rate": 6.466963677887208e-07,
"loss": 2.0702,
"step": 950
},
{
"epoch": 1.0437158469945356,
"grad_norm": 1.9561619861336457,
"learning_rate": 6.347236581328288e-07,
"loss": 2.0205,
"step": 955
},
{
"epoch": 1.0491803278688525,
"grad_norm": 1.8564459874702657,
"learning_rate": 6.229697413585796e-07,
"loss": 1.9857,
"step": 960
},
{
"epoch": 1.0546448087431695,
"grad_norm": 1.8936006989320295,
"learning_rate": 6.114310124079459e-07,
"loss": 2.0398,
"step": 965
},
{
"epoch": 1.0601092896174864,
"grad_norm": 1.9195082055604684,
"learning_rate": 6.001039191205155e-07,
"loss": 2.1075,
"step": 970
},
{
"epoch": 1.0655737704918034,
"grad_norm": 1.8455934194504435,
"learning_rate": 5.88984961553089e-07,
"loss": 2.0609,
"step": 975
},
{
"epoch": 1.0710382513661203,
"grad_norm": 1.8016262977421866,
"learning_rate": 5.780706913067893e-07,
"loss": 2.0502,
"step": 980
},
{
"epoch": 1.0765027322404372,
"grad_norm": 1.7677047836486395,
"learning_rate": 5.673577108616207e-07,
"loss": 2.051,
"step": 985
},
{
"epoch": 1.0819672131147542,
"grad_norm": 1.8046971572225328,
"learning_rate": 5.568426729184038e-07,
"loss": 2.0531,
"step": 990
},
{
"epoch": 1.0874316939890711,
"grad_norm": 1.82987759218833,
"learning_rate": 5.465222797480186e-07,
"loss": 2.0766,
"step": 995
},
{
"epoch": 1.092896174863388,
"grad_norm": 1.9232052942988758,
"learning_rate": 5.3639328254789e-07,
"loss": 2.0306,
"step": 1000
},
{
"epoch": 1.092896174863388,
"eval_loss": 2.277691602706909,
"eval_runtime": 75.0135,
"eval_samples_per_second": 86.744,
"eval_steps_per_second": 0.68,
"step": 1000
},
{
"epoch": 1.098360655737705,
"grad_norm": 2.0428068706971323,
"learning_rate": 5.264524808056471e-07,
"loss": 2.0239,
"step": 1005
},
{
"epoch": 1.1038251366120218,
"grad_norm": 1.8366344916179231,
"learning_rate": 5.166967216698893e-07,
"loss": 2.0634,
"step": 1010
},
{
"epoch": 1.1092896174863387,
"grad_norm": 1.7965628794594979,
"learning_rate": 5.071228993279937e-07,
"loss": 2.0611,
"step": 1015
},
{
"epoch": 1.1147540983606556,
"grad_norm": 2.017245605772054,
"learning_rate": 4.977279543908971e-07,
"loss": 2.0588,
"step": 1020
},
{
"epoch": 1.1202185792349726,
"grad_norm": 1.8313145099357355,
"learning_rate": 4.885088732847877e-07,
"loss": 2.0667,
"step": 1025
},
{
"epoch": 1.1256830601092895,
"grad_norm": 1.8185093106173156,
"learning_rate": 4.794626876496447e-07,
"loss": 2.0602,
"step": 1030
},
{
"epoch": 1.1311475409836065,
"grad_norm": 1.7971795633377927,
"learning_rate": 4.705864737445532e-07,
"loss": 2.0819,
"step": 1035
},
{
"epoch": 1.1366120218579234,
"grad_norm": 1.804877775071399,
"learning_rate": 4.6187735185974027e-07,
"loss": 2.0733,
"step": 1040
},
{
"epoch": 1.1420765027322404,
"grad_norm": 1.8882874062597697,
"learning_rate": 4.53332485735264e-07,
"loss": 2.0624,
"step": 1045
},
{
"epoch": 1.1475409836065573,
"grad_norm": 1.8725308660946791,
"learning_rate": 4.4494908198629223e-07,
"loss": 2.0751,
"step": 1050
},
{
"epoch": 1.1530054644808743,
"grad_norm": 1.832601726764631,
"learning_rate": 4.3672438953490993e-07,
"loss": 2.0633,
"step": 1055
},
{
"epoch": 1.1584699453551912,
"grad_norm": 1.864300674472044,
"learning_rate": 4.2865569904839347e-07,
"loss": 2.0313,
"step": 1060
},
{
"epoch": 1.1639344262295082,
"grad_norm": 1.8685178757975862,
"learning_rate": 4.2074034238388927e-07,
"loss": 2.0323,
"step": 1065
},
{
"epoch": 1.169398907103825,
"grad_norm": 1.9710058409039382,
"learning_rate": 4.129756920394366e-07,
"loss": 2.0582,
"step": 1070
},
{
"epoch": 1.174863387978142,
"grad_norm": 1.8687716914721453,
"learning_rate": 4.0535916061127434e-07,
"loss": 2.0985,
"step": 1075
},
{
"epoch": 1.180327868852459,
"grad_norm": 1.8500938997613081,
"learning_rate": 3.9788820025736986e-07,
"loss": 2.0767,
"step": 1080
},
{
"epoch": 1.185792349726776,
"grad_norm": 1.8312389667512146,
"learning_rate": 3.905603021671151e-07,
"loss": 2.0657,
"step": 1085
},
{
"epoch": 1.1912568306010929,
"grad_norm": 1.867588510082228,
"learning_rate": 3.833729960371216e-07,
"loss": 2.0341,
"step": 1090
},
{
"epoch": 1.1967213114754098,
"grad_norm": 2.0320023546596793,
"learning_rate": 3.763238495530669e-07,
"loss": 2.0428,
"step": 1095
},
{
"epoch": 1.2021857923497268,
"grad_norm": 1.8098546698756057,
"learning_rate": 3.6941046787752674e-07,
"loss": 2.0333,
"step": 1100
},
{
"epoch": 1.2076502732240437,
"grad_norm": 1.8386846652746143,
"learning_rate": 3.626304931437368e-07,
"loss": 2.0554,
"step": 1105
},
{
"epoch": 1.2131147540983607,
"grad_norm": 1.8784877277085623,
"learning_rate": 3.559816039552281e-07,
"loss": 2.0227,
"step": 1110
},
{
"epoch": 1.2185792349726776,
"grad_norm": 1.8657671721210465,
"learning_rate": 3.494615148912776e-07,
"loss": 2.0451,
"step": 1115
},
{
"epoch": 1.2240437158469946,
"grad_norm": 1.8007719413939671,
"learning_rate": 3.430679760181184e-07,
"loss": 2.0583,
"step": 1120
},
{
"epoch": 1.2295081967213115,
"grad_norm": 1.870766368370614,
"learning_rate": 3.367987724058537e-07,
"loss": 2.0488,
"step": 1125
},
{
"epoch": 1.2349726775956285,
"grad_norm": 1.8322491897546949,
"learning_rate": 3.3065172365101784e-07,
"loss": 2.0705,
"step": 1130
},
{
"epoch": 1.2404371584699454,
"grad_norm": 1.8536889671409005,
"learning_rate": 3.2462468340473055e-07,
"loss": 2.0704,
"step": 1135
},
{
"epoch": 1.2459016393442623,
"grad_norm": 1.8359760782064882,
"learning_rate": 3.1871553890638926e-07,
"loss": 2.0451,
"step": 1140
},
{
"epoch": 1.2513661202185793,
"grad_norm": 1.8344433175084502,
"learning_rate": 3.129222105228447e-07,
"loss": 2.0329,
"step": 1145
},
{
"epoch": 1.2568306010928962,
"grad_norm": 1.8537487825008587,
"learning_rate": 3.0724265129300667e-07,
"loss": 2.0534,
"step": 1150
},
{
"epoch": 1.2622950819672132,
"grad_norm": 1.8408591589072794,
"learning_rate": 3.016748464778264e-07,
"loss": 2.0942,
"step": 1155
},
{
"epoch": 1.2677595628415301,
"grad_norm": 1.8078246234829183,
"learning_rate": 2.962168131156018e-07,
"loss": 2.1283,
"step": 1160
},
{
"epoch": 1.273224043715847,
"grad_norm": 2.1814718317735906,
"learning_rate": 2.9086659958255433e-07,
"loss": 2.0702,
"step": 1165
},
{
"epoch": 1.278688524590164,
"grad_norm": 1.8045654405880427,
"learning_rate": 2.85622285158624e-07,
"loss": 2.0367,
"step": 1170
},
{
"epoch": 1.2841530054644807,
"grad_norm": 1.8472377696538738,
"learning_rate": 2.804819795984313e-07,
"loss": 2.0416,
"step": 1175
},
{
"epoch": 1.289617486338798,
"grad_norm": 1.905366081667851,
"learning_rate": 2.7544382270735544e-07,
"loss": 2.0775,
"step": 1180
},
{
"epoch": 1.2950819672131146,
"grad_norm": 1.9614873478866022,
"learning_rate": 2.7050598392267637e-07,
"loss": 2.0373,
"step": 1185
},
{
"epoch": 1.3005464480874318,
"grad_norm": 1.8310655830774525,
"learning_rate": 2.6566666189973166e-07,
"loss": 1.9924,
"step": 1190
},
{
"epoch": 1.3060109289617485,
"grad_norm": 1.7818072860023078,
"learning_rate": 2.609240841030368e-07,
"loss": 2.0684,
"step": 1195
},
{
"epoch": 1.3114754098360657,
"grad_norm": 1.9094992756338325,
"learning_rate": 2.5627650640232037e-07,
"loss": 2.0095,
"step": 1200
},
{
"epoch": 1.3114754098360657,
"eval_loss": 2.275972366333008,
"eval_runtime": 75.1224,
"eval_samples_per_second": 86.619,
"eval_steps_per_second": 0.679,
"step": 1200
},
{
"epoch": 1.3169398907103824,
"grad_norm": 1.8692781473355056,
"learning_rate": 2.517222126734241e-07,
"loss": 2.0688,
"step": 1205
},
{
"epoch": 1.3224043715846996,
"grad_norm": 1.952487765852821,
"learning_rate": 2.4725951440401845e-07,
"loss": 2.0702,
"step": 1210
},
{
"epoch": 1.3278688524590163,
"grad_norm": 1.8695789749802114,
"learning_rate": 2.428867503040866e-07,
"loss": 2.0588,
"step": 1215
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.8261227342272521,
"learning_rate": 2.386022859211273e-07,
"loss": 2.0136,
"step": 1220
},
{
"epoch": 1.3387978142076502,
"grad_norm": 1.7222392372978628,
"learning_rate": 2.3440451326002926e-07,
"loss": 2.0569,
"step": 1225
},
{
"epoch": 1.3442622950819672,
"grad_norm": 1.9067634498498296,
"learning_rate": 2.3029185040757038e-07,
"loss": 2.0261,
"step": 1230
},
{
"epoch": 1.349726775956284,
"grad_norm": 1.8423558641225324,
"learning_rate": 2.262627411614938e-07,
"loss": 2.0907,
"step": 1235
},
{
"epoch": 1.355191256830601,
"grad_norm": 1.840981313277747,
"learning_rate": 2.2231565466411502e-07,
"loss": 2.0525,
"step": 1240
},
{
"epoch": 1.360655737704918,
"grad_norm": 1.9131767631552514,
"learning_rate": 2.184490850404133e-07,
"loss": 2.0632,
"step": 1245
},
{
"epoch": 1.366120218579235,
"grad_norm": 1.787761186102589,
"learning_rate": 2.146615510405616e-07,
"loss": 2.0723,
"step": 1250
},
{
"epoch": 1.3715846994535519,
"grad_norm": 2.0011420142483685,
"learning_rate": 2.1095159568685124e-07,
"loss": 2.0347,
"step": 1255
},
{
"epoch": 1.3770491803278688,
"grad_norm": 2.04737119019968,
"learning_rate": 2.0731778592496148e-07,
"loss": 2.0157,
"step": 1260
},
{
"epoch": 1.3825136612021858,
"grad_norm": 1.8546271061398376,
"learning_rate": 2.03758712279536e-07,
"loss": 2.0558,
"step": 1265
},
{
"epoch": 1.3879781420765027,
"grad_norm": 1.823598167691669,
"learning_rate": 2.0027298851401635e-07,
"loss": 2.0707,
"step": 1270
},
{
"epoch": 1.3934426229508197,
"grad_norm": 1.8364088203878515,
"learning_rate": 1.968592512946914e-07,
"loss": 2.0616,
"step": 1275
},
{
"epoch": 1.3989071038251366,
"grad_norm": 1.8504039969740431,
"learning_rate": 1.935161598589178e-07,
"loss": 2.0442,
"step": 1280
},
{
"epoch": 1.4043715846994536,
"grad_norm": 1.8647380065818375,
"learning_rate": 1.902423956874689e-07,
"loss": 2.0309,
"step": 1285
},
{
"epoch": 1.4098360655737705,
"grad_norm": 1.8378312471521248,
"learning_rate": 1.870366621809691e-07,
"loss": 2.0322,
"step": 1290
},
{
"epoch": 1.4153005464480874,
"grad_norm": 1.925113709625938,
"learning_rate": 1.8389768434037062e-07,
"loss": 2.0688,
"step": 1295
},
{
"epoch": 1.4207650273224044,
"grad_norm": 1.860586095258991,
"learning_rate": 1.8082420845143144e-07,
"loss": 2.0745,
"step": 1300
},
{
"epoch": 1.4262295081967213,
"grad_norm": 1.8852545058774595,
"learning_rate": 1.778150017731515e-07,
"loss": 2.076,
"step": 1305
},
{
"epoch": 1.4316939890710383,
"grad_norm": 1.941199221075769,
"learning_rate": 1.7486885223012617e-07,
"loss": 2.0019,
"step": 1310
},
{
"epoch": 1.4371584699453552,
"grad_norm": 1.8446186191327532,
"learning_rate": 1.719845681087774e-07,
"loss": 2.0626,
"step": 1315
},
{
"epoch": 1.4426229508196722,
"grad_norm": 1.9134786622014528,
"learning_rate": 1.6916097775741735e-07,
"loss": 2.0477,
"step": 1320
},
{
"epoch": 1.4480874316939891,
"grad_norm": 1.8176316380129849,
"learning_rate": 1.6639692929010962e-07,
"loss": 2.0296,
"step": 1325
},
{
"epoch": 1.453551912568306,
"grad_norm": 1.8469951284525707,
"learning_rate": 1.636912902942842e-07,
"loss": 2.0342,
"step": 1330
},
{
"epoch": 1.459016393442623,
"grad_norm": 1.8740314655221872,
"learning_rate": 1.6104294754206772e-07,
"loss": 2.0445,
"step": 1335
},
{
"epoch": 1.46448087431694,
"grad_norm": 2.003913447054603,
"learning_rate": 1.5845080670528932e-07,
"loss": 2.0545,
"step": 1340
},
{
"epoch": 1.469945355191257,
"grad_norm": 1.856769949790638,
"learning_rate": 1.559137920741231e-07,
"loss": 2.0106,
"step": 1345
},
{
"epoch": 1.4754098360655736,
"grad_norm": 1.9009712565408305,
"learning_rate": 1.534308462793285e-07,
"loss": 2.0312,
"step": 1350
},
{
"epoch": 1.4808743169398908,
"grad_norm": 1.8649550487045021,
"learning_rate": 1.5100093001805e-07,
"loss": 2.058,
"step": 1355
},
{
"epoch": 1.4863387978142075,
"grad_norm": 1.8282766283581593,
"learning_rate": 1.486230217831383e-07,
"loss": 2.0109,
"step": 1360
},
{
"epoch": 1.4918032786885247,
"grad_norm": 1.8341029485175546,
"learning_rate": 1.462961175959548e-07,
"loss": 2.0767,
"step": 1365
},
{
"epoch": 1.4972677595628414,
"grad_norm": 1.9503794173682378,
"learning_rate": 1.4401923074262253e-07,
"loss": 2.0394,
"step": 1370
},
{
"epoch": 1.5027322404371586,
"grad_norm": 1.8750212125931591,
"learning_rate": 1.417913915136858e-07,
"loss": 2.061,
"step": 1375
},
{
"epoch": 1.5081967213114753,
"grad_norm": 1.850380238557527,
"learning_rate": 1.3961164694714208e-07,
"loss": 2.1208,
"step": 1380
},
{
"epoch": 1.5136612021857925,
"grad_norm": 1.883450204664466,
"learning_rate": 1.3747906057481e-07,
"loss": 2.041,
"step": 1385
},
{
"epoch": 1.5191256830601092,
"grad_norm": 2.0647139754404673,
"learning_rate": 1.3539271217199617e-07,
"loss": 2.0448,
"step": 1390
},
{
"epoch": 1.5245901639344264,
"grad_norm": 1.8198932393101204,
"learning_rate": 1.3335169751042653e-07,
"loss": 2.0706,
"step": 1395
},
{
"epoch": 1.530054644808743,
"grad_norm": 1.8262850198089926,
"learning_rate": 1.3135512811440523e-07,
"loss": 2.0539,
"step": 1400
},
{
"epoch": 1.530054644808743,
"eval_loss": 2.274564743041992,
"eval_runtime": 75.0617,
"eval_samples_per_second": 86.689,
"eval_steps_per_second": 0.679,
"step": 1400
},
{
"epoch": 1.5355191256830603,
"grad_norm": 2.1787152604764377,
"learning_rate": 1.294021310201668e-07,
"loss": 2.0272,
"step": 1405
},
{
"epoch": 1.540983606557377,
"grad_norm": 1.895178288065996,
"learning_rate": 1.2749184853838634e-07,
"loss": 2.0395,
"step": 1410
},
{
"epoch": 1.5464480874316942,
"grad_norm": 1.8243874586884308,
"learning_rate": 1.2562343801981296e-07,
"loss": 2.0385,
"step": 1415
},
{
"epoch": 1.5519125683060109,
"grad_norm": 1.8215546298276755,
"learning_rate": 1.237960716239925e-07,
"loss": 2.0299,
"step": 1420
},
{
"epoch": 1.5573770491803278,
"grad_norm": 1.9144206068231184,
"learning_rate": 1.2200893609104527e-07,
"loss": 2.0693,
"step": 1425
},
{
"epoch": 1.5628415300546448,
"grad_norm": 1.832800068410983,
"learning_rate": 1.2026123251646523e-07,
"loss": 2.0911,
"step": 1430
},
{
"epoch": 1.5683060109289617,
"grad_norm": 1.862861787979993,
"learning_rate": 1.1855217612890718e-07,
"loss": 2.0475,
"step": 1435
},
{
"epoch": 1.5737704918032787,
"grad_norm": 2.0333731738009293,
"learning_rate": 1.1688099607092871e-07,
"loss": 2.0482,
"step": 1440
},
{
"epoch": 1.5792349726775956,
"grad_norm": 1.8091303840445014,
"learning_rate": 1.1524693518265448e-07,
"loss": 2.0482,
"step": 1445
},
{
"epoch": 1.5846994535519126,
"grad_norm": 1.864510400120361,
"learning_rate": 1.136492497883297e-07,
"loss": 2.0948,
"step": 1450
},
{
"epoch": 1.5901639344262295,
"grad_norm": 1.9652692377864456,
"learning_rate": 1.1208720948573126e-07,
"loss": 2.0189,
"step": 1455
},
{
"epoch": 1.5956284153005464,
"grad_norm": 1.9202417329675314,
"learning_rate": 1.1056009693840394e-07,
"loss": 2.078,
"step": 1460
},
{
"epoch": 1.6010928961748634,
"grad_norm": 1.8209643705209526,
"learning_rate": 1.0906720767069055e-07,
"loss": 2.0417,
"step": 1465
},
{
"epoch": 1.6065573770491803,
"grad_norm": 1.8079220297452976,
"learning_rate": 1.0760784986552422e-07,
"loss": 2.041,
"step": 1470
},
{
"epoch": 1.6120218579234973,
"grad_norm": 1.9197478181290593,
"learning_rate": 1.0618134416495201e-07,
"loss": 2.0091,
"step": 1475
},
{
"epoch": 1.6174863387978142,
"grad_norm": 1.8520224557231018,
"learning_rate": 1.0478702347335883e-07,
"loss": 2.0082,
"step": 1480
},
{
"epoch": 1.6229508196721312,
"grad_norm": 1.8273112792953872,
"learning_rate": 1.0342423276336188e-07,
"loss": 2.0446,
"step": 1485
},
{
"epoch": 1.6284153005464481,
"grad_norm": 1.8630686076964935,
"learning_rate": 1.0209232888434338e-07,
"loss": 2.0629,
"step": 1490
},
{
"epoch": 1.633879781420765,
"grad_norm": 1.8271338340219678,
"learning_rate": 1.0079068037359431e-07,
"loss": 2.0609,
"step": 1495
},
{
"epoch": 1.639344262295082,
"grad_norm": 1.8144459145882181,
"learning_rate": 9.951866727003745e-08,
"loss": 2.0364,
"step": 1500
},
{
"epoch": 1.644808743169399,
"grad_norm": 1.8507906861527859,
"learning_rate": 9.827568093050098e-08,
"loss": 2.0506,
"step": 1505
},
{
"epoch": 1.650273224043716,
"grad_norm": 1.8319335425047658,
"learning_rate": 9.706112384851353e-08,
"loss": 2.0253,
"step": 1510
},
{
"epoch": 1.6557377049180326,
"grad_norm": 1.8984240206563825,
"learning_rate": 9.587440947559151e-08,
"loss": 2.0648,
"step": 1515
},
{
"epoch": 1.6612021857923498,
"grad_norm": 1.8507118687181,
"learning_rate": 9.471496204499047e-08,
"loss": 2.0231,
"step": 1520
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.9702444306141391,
"learning_rate": 9.358221639789162e-08,
"loss": 2.0409,
"step": 1525
},
{
"epoch": 1.6721311475409837,
"grad_norm": 1.8247902367319633,
"learning_rate": 9.247561781199593e-08,
"loss": 2.0205,
"step": 1530
},
{
"epoch": 1.6775956284153004,
"grad_norm": 1.8563007653589343,
"learning_rate": 9.139462183249743e-08,
"loss": 2.0488,
"step": 1535
},
{
"epoch": 1.6830601092896176,
"grad_norm": 1.8206552163382879,
"learning_rate": 9.033869410540892e-08,
"loss": 2.0166,
"step": 1540
},
{
"epoch": 1.6885245901639343,
"grad_norm": 1.9126349973731116,
"learning_rate": 8.930731021321133e-08,
"loss": 2.0486,
"step": 1545
},
{
"epoch": 1.6939890710382515,
"grad_norm": 1.8565348783702142,
"learning_rate": 8.829995551280143e-08,
"loss": 2.0342,
"step": 1550
},
{
"epoch": 1.6994535519125682,
"grad_norm": 1.909638179979103,
"learning_rate": 8.731612497570976e-08,
"loss": 2.073,
"step": 1555
},
{
"epoch": 1.7049180327868854,
"grad_norm": 1.8690803406902856,
"learning_rate": 8.635532303056259e-08,
"loss": 2.0231,
"step": 1560
},
{
"epoch": 1.710382513661202,
"grad_norm": 1.8909253306804354,
"learning_rate": 8.541706340776192e-08,
"loss": 2.0341,
"step": 1565
},
{
"epoch": 1.7158469945355193,
"grad_norm": 1.8844764835255978,
"learning_rate": 8.450086898635676e-08,
"loss": 2.0347,
"step": 1570
},
{
"epoch": 1.721311475409836,
"grad_norm": 1.75571408467022,
"learning_rate": 8.360627164308056e-08,
"loss": 2.0801,
"step": 1575
},
{
"epoch": 1.7267759562841531,
"grad_norm": 1.8436023942890172,
"learning_rate": 8.273281210352872e-08,
"loss": 2.0365,
"step": 1580
},
{
"epoch": 1.7322404371584699,
"grad_norm": 1.9085196731178369,
"learning_rate": 8.188003979545094e-08,
"loss": 2.0531,
"step": 1585
},
{
"epoch": 1.737704918032787,
"grad_norm": 1.8822253588875573,
"learning_rate": 8.104751270413362e-08,
"loss": 2.0784,
"step": 1590
},
{
"epoch": 1.7431693989071038,
"grad_norm": 1.788484127481047,
"learning_rate": 8.02347972298469e-08,
"loss": 2.0478,
"step": 1595
},
{
"epoch": 1.748633879781421,
"grad_norm": 1.8239121636996685,
"learning_rate": 7.944146804733213e-08,
"loss": 2.0338,
"step": 1600
},
{
"epoch": 1.748633879781421,
"eval_loss": 2.2742836475372314,
"eval_runtime": 75.0576,
"eval_samples_per_second": 86.693,
"eval_steps_per_second": 0.679,
"step": 1600
},
{
"epoch": 1.7540983606557377,
"grad_norm": 1.9239534582352587,
"learning_rate": 7.866710796730526e-08,
"loss": 2.0631,
"step": 1605
},
{
"epoch": 1.7595628415300546,
"grad_norm": 1.8903502091457296,
"learning_rate": 7.791130779995196e-08,
"loss": 2.0572,
"step": 1610
},
{
"epoch": 1.7650273224043715,
"grad_norm": 1.8293603401943201,
"learning_rate": 7.717366622039046e-08,
"loss": 2.0668,
"step": 1615
},
{
"epoch": 1.7704918032786885,
"grad_norm": 1.9173015833072757,
"learning_rate": 7.64537896360787e-08,
"loss": 2.0435,
"step": 1620
},
{
"epoch": 1.7759562841530054,
"grad_norm": 1.8793531033612623,
"learning_rate": 7.575129205614193e-08,
"loss": 2.0722,
"step": 1625
},
{
"epoch": 1.7814207650273224,
"grad_norm": 2.0902523194542084,
"learning_rate": 7.50657949625979e-08,
"loss": 2.0433,
"step": 1630
},
{
"epoch": 1.7868852459016393,
"grad_norm": 1.9107654817346211,
"learning_rate": 7.439692718345629e-08,
"loss": 2.0456,
"step": 1635
},
{
"epoch": 1.7923497267759563,
"grad_norm": 1.9360239043323952,
"learning_rate": 7.374432476766986e-08,
"loss": 2.006,
"step": 1640
},
{
"epoch": 1.7978142076502732,
"grad_norm": 1.9192977806152298,
"learning_rate": 7.310763086191462e-08,
"loss": 2.0468,
"step": 1645
},
{
"epoch": 1.8032786885245902,
"grad_norm": 1.8243879474864746,
"learning_rate": 7.248649558917661e-08,
"loss": 2.0798,
"step": 1650
},
{
"epoch": 1.8087431693989071,
"grad_norm": 1.8320967907842092,
"learning_rate": 7.18805759291233e-08,
"loss": 2.0515,
"step": 1655
},
{
"epoch": 1.814207650273224,
"grad_norm": 1.8532616512840305,
"learning_rate": 7.128953560023773e-08,
"loss": 2.0775,
"step": 1660
},
{
"epoch": 1.819672131147541,
"grad_norm": 1.838552382273461,
"learning_rate": 7.071304494369334e-08,
"loss": 2.0479,
"step": 1665
},
{
"epoch": 1.825136612021858,
"grad_norm": 1.924941490211915,
"learning_rate": 7.015078080894855e-08,
"loss": 2.0786,
"step": 1670
},
{
"epoch": 1.830601092896175,
"grad_norm": 2.143894042689188,
"learning_rate": 6.960242644103938e-08,
"loss": 2.0834,
"step": 1675
},
{
"epoch": 1.8360655737704918,
"grad_norm": 1.8651804657911415,
"learning_rate": 6.906767136954927e-08,
"loss": 2.0642,
"step": 1680
},
{
"epoch": 1.8415300546448088,
"grad_norm": 1.9275400611989582,
"learning_rate": 6.854621129923514e-08,
"loss": 2.0485,
"step": 1685
},
{
"epoch": 1.8469945355191257,
"grad_norm": 1.861313763790637,
"learning_rate": 6.803774800228914e-08,
"loss": 2.0999,
"step": 1690
},
{
"epoch": 1.8524590163934427,
"grad_norm": 1.8930724854627998,
"learning_rate": 6.754198921221566e-08,
"loss": 2.0448,
"step": 1695
},
{
"epoch": 1.8579234972677594,
"grad_norm": 1.8993885693049763,
"learning_rate": 6.705864851930317e-08,
"loss": 2.0511,
"step": 1700
},
{
"epoch": 1.8633879781420766,
"grad_norm": 1.883117813333527,
"learning_rate": 6.658744526767117e-08,
"loss": 2.0503,
"step": 1705
},
{
"epoch": 1.8688524590163933,
"grad_norm": 1.848538549934253,
"learning_rate": 6.612810445387236e-08,
"loss": 2.0636,
"step": 1710
},
{
"epoch": 1.8743169398907105,
"grad_norm": 1.8938252963914626,
"learning_rate": 6.568035662702993e-08,
"loss": 2.0718,
"step": 1715
},
{
"epoch": 1.8797814207650272,
"grad_norm": 1.941319110309079,
"learning_rate": 6.524393779049134e-08,
"loss": 2.0647,
"step": 1720
},
{
"epoch": 1.8852459016393444,
"grad_norm": 2.0070472308658207,
"learning_rate": 6.481858930497878e-08,
"loss": 2.0546,
"step": 1725
},
{
"epoch": 1.890710382513661,
"grad_norm": 2.054630166123197,
"learning_rate": 6.440405779321743e-08,
"loss": 2.0349,
"step": 1730
},
{
"epoch": 1.8961748633879782,
"grad_norm": 1.8560973783317283,
"learning_rate": 6.40000950460228e-08,
"loss": 2.053,
"step": 1735
},
{
"epoch": 1.901639344262295,
"grad_norm": 1.8560379830723175,
"learning_rate": 6.360645792982822e-08,
"loss": 2.0397,
"step": 1740
},
{
"epoch": 1.9071038251366121,
"grad_norm": 1.8759906071094705,
"learning_rate": 6.322290829563445e-08,
"loss": 2.0582,
"step": 1745
},
{
"epoch": 1.9125683060109289,
"grad_norm": 1.8523523069150685,
"learning_rate": 6.284921288936269e-08,
"loss": 2.0589,
"step": 1750
},
{
"epoch": 1.918032786885246,
"grad_norm": 1.7917256365306369,
"learning_rate": 6.248514326359321e-08,
"loss": 2.0742,
"step": 1755
},
{
"epoch": 1.9234972677595628,
"grad_norm": 1.841924086545583,
"learning_rate": 6.213047569067165e-08,
"loss": 2.0714,
"step": 1760
},
{
"epoch": 1.92896174863388,
"grad_norm": 1.8696658012304237,
"learning_rate": 6.178499107716513e-08,
"loss": 2.0,
"step": 1765
},
{
"epoch": 1.9344262295081966,
"grad_norm": 1.8879810882710348,
"learning_rate": 6.144847487965106e-08,
"loss": 2.0584,
"step": 1770
},
{
"epoch": 1.9398907103825138,
"grad_norm": 1.819319260545883,
"learning_rate": 6.112071702182056e-08,
"loss": 2.0353,
"step": 1775
},
{
"epoch": 1.9453551912568305,
"grad_norm": 1.8742671379299753,
"learning_rate": 6.080151181288026e-08,
"loss": 2.0478,
"step": 1780
},
{
"epoch": 1.9508196721311475,
"grad_norm": 1.8684517998018801,
"learning_rate": 6.049065786723472e-08,
"loss": 2.0565,
"step": 1785
},
{
"epoch": 1.9562841530054644,
"grad_norm": 1.8109987887119923,
"learning_rate": 6.018795802543315e-08,
"loss": 2.0587,
"step": 1790
},
{
"epoch": 1.9617486338797814,
"grad_norm": 1.9407341077135385,
"learning_rate": 5.98932192763636e-08,
"loss": 2.048,
"step": 1795
},
{
"epoch": 1.9672131147540983,
"grad_norm": 1.8354866267003231,
"learning_rate": 5.960625268067816e-08,
"loss": 2.0648,
"step": 1800
},
{
"epoch": 1.9672131147540983,
"eval_loss": 2.2736637592315674,
"eval_runtime": 75.0951,
"eval_samples_per_second": 86.65,
"eval_steps_per_second": 0.679,
"step": 1800
},
{
"epoch": 1.9726775956284153,
"grad_norm": 1.7987669216918003,
"learning_rate": 5.9326873295433023e-08,
"loss": 2.0055,
"step": 1805
},
{
"epoch": 1.9781420765027322,
"grad_norm": 1.8771206541798455,
"learning_rate": 5.905490009992716e-08,
"loss": 2.0875,
"step": 1810
},
{
"epoch": 1.9836065573770492,
"grad_norm": 1.798209591995569,
"learning_rate": 5.8790155922723804e-08,
"loss": 2.0414,
"step": 1815
},
{
"epoch": 1.989071038251366,
"grad_norm": 1.8140574162134413,
"learning_rate": 5.8532467369838935e-08,
"loss": 2.0476,
"step": 1820
},
{
"epoch": 1.994535519125683,
"grad_norm": 1.9205008465204905,
"learning_rate": 5.82816647540811e-08,
"loss": 2.0414,
"step": 1825
},
{
"epoch": 2.0,
"grad_norm": 1.900514133765418,
"learning_rate": 5.803758202552724e-08,
"loss": 2.0637,
"step": 1830
},
{
"epoch": 2.0054644808743167,
"grad_norm": 1.812710905593721,
"learning_rate": 5.780005670311929e-08,
"loss": 2.0017,
"step": 1835
},
{
"epoch": 2.010928961748634,
"grad_norm": 1.8787871615638423,
"learning_rate": 5.756892980736625e-08,
"loss": 1.9808,
"step": 1840
},
{
"epoch": 2.0163934426229506,
"grad_norm": 1.9008500311802838,
"learning_rate": 5.7344045794137134e-08,
"loss": 2.0183,
"step": 1845
},
{
"epoch": 2.021857923497268,
"grad_norm": 1.8439766004122011,
"learning_rate": 5.7125252489529687e-08,
"loss": 2.0492,
"step": 1850
},
{
"epoch": 2.0273224043715845,
"grad_norm": 1.8093023853453647,
"learning_rate": 5.6912401025800444e-08,
"loss": 2.0498,
"step": 1855
},
{
"epoch": 2.0327868852459017,
"grad_norm": 1.8621731843549314,
"learning_rate": 5.670534577834171e-08,
"loss": 2.0566,
"step": 1860
},
{
"epoch": 2.0382513661202184,
"grad_norm": 1.7945188171488486,
"learning_rate": 5.6503944303690994e-08,
"loss": 2.0399,
"step": 1865
},
{
"epoch": 2.0437158469945356,
"grad_norm": 1.8231871269338034,
"learning_rate": 5.630805727855896e-08,
"loss": 2.0348,
"step": 1870
},
{
"epoch": 2.0491803278688523,
"grad_norm": 1.9219456613473263,
"learning_rate": 5.611754843986178e-08,
"loss": 2.0056,
"step": 1875
},
{
"epoch": 2.0546448087431695,
"grad_norm": 1.7850350529775676,
"learning_rate": 5.5932284525744105e-08,
"loss": 2.0062,
"step": 1880
},
{
"epoch": 2.060109289617486,
"grad_norm": 1.9708916029467265,
"learning_rate": 5.5752135217578976e-08,
"loss": 2.0024,
"step": 1885
},
{
"epoch": 2.0655737704918034,
"grad_norm": 1.913183828308229,
"learning_rate": 5.55769730829312e-08,
"loss": 2.0277,
"step": 1890
},
{
"epoch": 2.07103825136612,
"grad_norm": 1.8542316127529779,
"learning_rate": 5.5406673519470675e-08,
"loss": 2.0015,
"step": 1895
},
{
"epoch": 2.0765027322404372,
"grad_norm": 1.87389845276975,
"learning_rate": 5.5241114699822666e-08,
"loss": 2.0709,
"step": 1900
},
{
"epoch": 2.081967213114754,
"grad_norm": 1.980231294589721,
"learning_rate": 5.508017751734168e-08,
"loss": 2.008,
"step": 1905
},
{
"epoch": 2.087431693989071,
"grad_norm": 1.8517464515604878,
"learning_rate": 5.492374553279633e-08,
"loss": 2.0203,
"step": 1910
},
{
"epoch": 2.092896174863388,
"grad_norm": 1.84630325467075,
"learning_rate": 5.477170492195204e-08,
"loss": 2.0385,
"step": 1915
},
{
"epoch": 2.098360655737705,
"grad_norm": 1.8768394234332548,
"learning_rate": 5.46239444240393e-08,
"loss": 2.0187,
"step": 1920
},
{
"epoch": 2.1038251366120218,
"grad_norm": 1.8943060640364853,
"learning_rate": 5.4480355291094704e-08,
"loss": 2.0574,
"step": 1925
},
{
"epoch": 2.109289617486339,
"grad_norm": 1.8863483174705893,
"learning_rate": 5.4340831238162615e-08,
"loss": 2.0217,
"step": 1930
},
{
"epoch": 2.1147540983606556,
"grad_norm": 1.8885742771135787,
"learning_rate": 5.420526839434506e-08,
"loss": 2.0538,
"step": 1935
},
{
"epoch": 2.120218579234973,
"grad_norm": 1.8210903752712588,
"learning_rate": 5.4073565254687946e-08,
"loss": 2.0324,
"step": 1940
},
{
"epoch": 2.1256830601092895,
"grad_norm": 1.8278501741427702,
"learning_rate": 5.3945622632891495e-08,
"loss": 2.0376,
"step": 1945
},
{
"epoch": 2.1311475409836067,
"grad_norm": 1.8851985007280183,
"learning_rate": 5.382134361483329e-08,
"loss": 2.0602,
"step": 1950
},
{
"epoch": 2.1366120218579234,
"grad_norm": 1.8672742611841686,
"learning_rate": 5.370063351289204e-08,
"loss": 2.0443,
"step": 1955
},
{
"epoch": 2.1420765027322406,
"grad_norm": 1.9023532236989618,
"learning_rate": 5.358339982106074e-08,
"loss": 2.0178,
"step": 1960
},
{
"epoch": 2.1475409836065573,
"grad_norm": 1.8533754595108112,
"learning_rate": 5.346955217083767e-08,
"loss": 2.0289,
"step": 1965
},
{
"epoch": 2.1530054644808745,
"grad_norm": 1.8751406718039245,
"learning_rate": 5.335900228788407e-08,
"loss": 2.0258,
"step": 1970
},
{
"epoch": 2.158469945355191,
"grad_norm": 1.911401329876507,
"learning_rate": 5.3251663949437266e-08,
"loss": 2.0621,
"step": 1975
},
{
"epoch": 2.1639344262295084,
"grad_norm": 1.8780553903992336,
"learning_rate": 5.3147452942468386e-08,
"loss": 1.9947,
"step": 1980
},
{
"epoch": 2.169398907103825,
"grad_norm": 1.9417700354104075,
"learning_rate": 5.3046287022573567e-08,
"loss": 2.0627,
"step": 1985
},
{
"epoch": 2.1748633879781423,
"grad_norm": 1.9335794456687536,
"learning_rate": 5.2948085873588114e-08,
"loss": 2.0621,
"step": 1990
},
{
"epoch": 2.180327868852459,
"grad_norm": 1.8441134776475825,
"learning_rate": 5.2852771067912865e-08,
"loss": 2.0741,
"step": 1995
},
{
"epoch": 2.185792349726776,
"grad_norm": 1.93007244053263,
"learning_rate": 5.276026602754233e-08,
"loss": 2.0297,
"step": 2000
},
{
"epoch": 2.185792349726776,
"eval_loss": 2.2766480445861816,
"eval_runtime": 75.0721,
"eval_samples_per_second": 86.677,
"eval_steps_per_second": 0.679,
"step": 2000
},
{
"epoch": 2.191256830601093,
"grad_norm": 1.8271236407081135,
"learning_rate": 5.267049598578416e-08,
"loss": 1.998,
"step": 2005
},
{
"epoch": 2.19672131147541,
"grad_norm": 1.9080141032090714,
"learning_rate": 5.258338794965976e-08,
"loss": 2.0317,
"step": 2010
},
{
"epoch": 2.202185792349727,
"grad_norm": 1.9296381326844356,
"learning_rate": 5.2498870662975855e-08,
"loss": 2.0527,
"step": 2015
},
{
"epoch": 2.2076502732240435,
"grad_norm": 1.8649383667959,
"learning_rate": 5.241687457005712e-08,
"loss": 2.0167,
"step": 2020
},
{
"epoch": 2.2131147540983607,
"grad_norm": 1.8733130363096773,
"learning_rate": 5.233733178012981e-08,
"loss": 2.0553,
"step": 2025
},
{
"epoch": 2.2185792349726774,
"grad_norm": 1.8744906953041462,
"learning_rate": 5.226017603234672e-08,
"loss": 2.0345,
"step": 2030
},
{
"epoch": 2.2240437158469946,
"grad_norm": 1.853006039099042,
"learning_rate": 5.2185342661443896e-08,
"loss": 1.9966,
"step": 2035
},
{
"epoch": 2.2295081967213113,
"grad_norm": 1.8910360796325498,
"learning_rate": 5.211276856401939e-08,
"loss": 2.0135,
"step": 2040
},
{
"epoch": 2.2349726775956285,
"grad_norm": 1.8514291560164504,
"learning_rate": 5.2042392165424757e-08,
"loss": 2.0205,
"step": 2045
},
{
"epoch": 2.240437158469945,
"grad_norm": 1.8446701052221985,
"learning_rate": 5.197415338725999e-08,
"loss": 2.0301,
"step": 2050
},
{
"epoch": 2.2459016393442623,
"grad_norm": 1.8588932533873443,
"learning_rate": 5.1907993615462615e-08,
"loss": 2.0287,
"step": 2055
},
{
"epoch": 2.251366120218579,
"grad_norm": 1.8963789966982134,
"learning_rate": 5.1843855668982e-08,
"loss": 2.0719,
"step": 2060
},
{
"epoch": 2.2568306010928962,
"grad_norm": 1.9044908060597479,
"learning_rate": 5.17816837690297e-08,
"loss": 1.9721,
"step": 2065
},
{
"epoch": 2.262295081967213,
"grad_norm": 1.9295350690511475,
"learning_rate": 5.172142350889727e-08,
"loss": 2.0225,
"step": 2070
},
{
"epoch": 2.26775956284153,
"grad_norm": 1.9304743860423463,
"learning_rate": 5.166302182433254e-08,
"loss": 2.0263,
"step": 2075
},
{
"epoch": 2.273224043715847,
"grad_norm": 1.8671651062857888,
"learning_rate": 5.160642696446577e-08,
"loss": 2.0241,
"step": 2080
},
{
"epoch": 2.278688524590164,
"grad_norm": 1.8420650638603713,
"learning_rate": 5.155158846327734e-08,
"loss": 2.0206,
"step": 2085
},
{
"epoch": 2.2841530054644807,
"grad_norm": 1.8005272409919932,
"learning_rate": 5.149845711159822e-08,
"loss": 2.0365,
"step": 2090
},
{
"epoch": 2.289617486338798,
"grad_norm": 1.857808778071221,
"learning_rate": 5.144698492963522e-08,
"loss": 2.0911,
"step": 2095
},
{
"epoch": 2.2950819672131146,
"grad_norm": 1.9093915008013214,
"learning_rate": 5.139712514001258e-08,
"loss": 2.0428,
"step": 2100
},
{
"epoch": 2.300546448087432,
"grad_norm": 1.8578270044645933,
"learning_rate": 5.134883214132186e-08,
"loss": 2.0124,
"step": 2105
},
{
"epoch": 2.3060109289617485,
"grad_norm": 1.822793058548045,
"learning_rate": 5.130206148217218e-08,
"loss": 2.0746,
"step": 2110
},
{
"epoch": 2.3114754098360657,
"grad_norm": 1.8442935114306909,
"learning_rate": 5.12567698357328e-08,
"loss": 2.0444,
"step": 2115
},
{
"epoch": 2.3169398907103824,
"grad_norm": 1.9739163571989773,
"learning_rate": 5.1212914974760244e-08,
"loss": 2.0435,
"step": 2120
},
{
"epoch": 2.3224043715846996,
"grad_norm": 1.8469690810911081,
"learning_rate": 5.117045574710235e-08,
"loss": 2.0545,
"step": 2125
},
{
"epoch": 2.3278688524590163,
"grad_norm": 1.8661244475654946,
"learning_rate": 5.112935205167153e-08,
"loss": 2.0058,
"step": 2130
},
{
"epoch": 2.3333333333333335,
"grad_norm": 1.8937750326566802,
"learning_rate": 5.108956481487976e-08,
"loss": 2.0293,
"step": 2135
},
{
"epoch": 2.33879781420765,
"grad_norm": 1.9208419450347225,
"learning_rate": 5.105105596752788e-08,
"loss": 2.0414,
"step": 2140
},
{
"epoch": 2.3442622950819674,
"grad_norm": 2.1121282228006555,
"learning_rate": 5.101378842214193e-08,
"loss": 2.0869,
"step": 2145
},
{
"epoch": 2.349726775956284,
"grad_norm": 1.8848256373264323,
"learning_rate": 5.0977726050749185e-08,
"loss": 2.0614,
"step": 2150
},
{
"epoch": 2.3551912568306013,
"grad_norm": 1.8507097430278243,
"learning_rate": 5.094283366308685e-08,
"loss": 2.0249,
"step": 2155
},
{
"epoch": 2.360655737704918,
"grad_norm": 1.8890593768953334,
"learning_rate": 5.0909076985236385e-08,
"loss": 2.0068,
"step": 2160
},
{
"epoch": 2.366120218579235,
"grad_norm": 1.8651383954059584,
"learning_rate": 5.0876422638676395e-08,
"loss": 2.0044,
"step": 2165
},
{
"epoch": 2.371584699453552,
"grad_norm": 1.860146100854827,
"learning_rate": 5.084483811974733e-08,
"loss": 2.054,
"step": 2170
},
{
"epoch": 2.3770491803278686,
"grad_norm": 1.7761767522433785,
"learning_rate": 5.0814291779521236e-08,
"loss": 2.0229,
"step": 2175
},
{
"epoch": 2.3825136612021858,
"grad_norm": 1.8386788502881166,
"learning_rate": 5.078475280406979e-08,
"loss": 2.0662,
"step": 2180
},
{
"epoch": 2.387978142076503,
"grad_norm": 1.990375014859749,
"learning_rate": 5.075619119512409e-08,
"loss": 2.0393,
"step": 2185
},
{
"epoch": 2.3934426229508197,
"grad_norm": 2.2419457883038314,
"learning_rate": 5.0728577751119725e-08,
"loss": 2.0523,
"step": 2190
},
{
"epoch": 2.3989071038251364,
"grad_norm": 1.9472851198204904,
"learning_rate": 5.0701884048620594e-08,
"loss": 2.0433,
"step": 2195
},
{
"epoch": 2.4043715846994536,
"grad_norm": 1.8641814407570831,
"learning_rate": 5.067608242411532e-08,
"loss": 2.0487,
"step": 2200
},
{
"epoch": 2.4043715846994536,
"eval_loss": 2.276731014251709,
"eval_runtime": 75.0853,
"eval_samples_per_second": 86.661,
"eval_steps_per_second": 0.679,
"step": 2200
},
{
"epoch": 2.4098360655737707,
"grad_norm": 1.8849638877894628,
"learning_rate": 5.065114595617981e-08,
"loss": 2.0449,
"step": 2205
},
{
"epoch": 2.4153005464480874,
"grad_norm": 1.897559223397183,
"learning_rate": 5.0627048448e-08,
"loss": 2.0172,
"step": 2210
},
{
"epoch": 2.420765027322404,
"grad_norm": 1.8881686050860271,
"learning_rate": 5.060376441024851e-08,
"loss": 2.0104,
"step": 2215
},
{
"epoch": 2.4262295081967213,
"grad_norm": 1.8760582543927924,
"learning_rate": 5.0581269044309416e-08,
"loss": 2.0514,
"step": 2220
},
{
"epoch": 2.431693989071038,
"grad_norm": 1.8590736579277904,
"learning_rate": 5.055953822584505e-08,
"loss": 2.0065,
"step": 2225
},
{
"epoch": 2.4371584699453552,
"grad_norm": 2.014653467507829,
"learning_rate": 5.0538548488699095e-08,
"loss": 2.0011,
"step": 2230
},
{
"epoch": 2.442622950819672,
"grad_norm": 2.0013649829205202,
"learning_rate": 5.0518277009130157e-08,
"loss": 2.0858,
"step": 2235
},
{
"epoch": 2.448087431693989,
"grad_norm": 1.8662711132468726,
"learning_rate": 5.0498701590370246e-08,
"loss": 2.0186,
"step": 2240
},
{
"epoch": 2.453551912568306,
"grad_norm": 1.9004909274072246,
"learning_rate": 5.047980064750245e-08,
"loss": 2.0112,
"step": 2245
},
{
"epoch": 2.459016393442623,
"grad_norm": 1.8857990524183288,
"learning_rate": 5.04615531926523e-08,
"loss": 2.0886,
"step": 2250
},
{
"epoch": 2.4644808743169397,
"grad_norm": 1.8001269753111797,
"learning_rate": 5.04439388204875e-08,
"loss": 1.9974,
"step": 2255
},
{
"epoch": 2.469945355191257,
"grad_norm": 1.8470988845468073,
"learning_rate": 5.042693769402049e-08,
"loss": 1.9826,
"step": 2260
},
{
"epoch": 2.4754098360655736,
"grad_norm": 1.8736758608587534,
"learning_rate": 5.041053053070867e-08,
"loss": 2.0697,
"step": 2265
},
{
"epoch": 2.480874316939891,
"grad_norm": 1.9221103936145996,
"learning_rate": 5.039469858884701e-08,
"loss": 2.0596,
"step": 2270
},
{
"epoch": 2.4863387978142075,
"grad_norm": 1.8924072367922147,
"learning_rate": 5.037942365424796e-08,
"loss": 2.0233,
"step": 2275
},
{
"epoch": 2.4918032786885247,
"grad_norm": 1.8504333814599807,
"learning_rate": 5.036468802720349e-08,
"loss": 2.0577,
"step": 2280
},
{
"epoch": 2.4972677595628414,
"grad_norm": 1.846494429919382,
"learning_rate": 5.035047450972435e-08,
"loss": 2.0249,
"step": 2285
},
{
"epoch": 2.5027322404371586,
"grad_norm": 1.8525565278611498,
"learning_rate": 5.033676639305158e-08,
"loss": 2.0432,
"step": 2290
},
{
"epoch": 2.5081967213114753,
"grad_norm": 1.988134266951729,
"learning_rate": 5.0323547445435455e-08,
"loss": 2.0604,
"step": 2295
},
{
"epoch": 2.5136612021857925,
"grad_norm": 1.858473570561303,
"learning_rate": 5.0310801900177e-08,
"loss": 2.0029,
"step": 2300
},
{
"epoch": 2.519125683060109,
"grad_norm": 1.8897075095235156,
"learning_rate": 5.029851444392739e-08,
"loss": 2.0182,
"step": 2305
},
{
"epoch": 2.5245901639344264,
"grad_norm": 1.9818990480911667,
"learning_rate": 5.028667020524067e-08,
"loss": 1.9902,
"step": 2310
},
{
"epoch": 2.530054644808743,
"grad_norm": 1.8548937980299227,
"learning_rate": 5.027525474337505e-08,
"loss": 2.0113,
"step": 2315
},
{
"epoch": 2.5355191256830603,
"grad_norm": 1.9033985343889175,
"learning_rate": 5.0264254037338365e-08,
"loss": 2.0591,
"step": 2320
},
{
"epoch": 2.540983606557377,
"grad_norm": 1.8954918019108078,
"learning_rate": 5.025365447517326e-08,
"loss": 2.0424,
"step": 2325
},
{
"epoch": 2.546448087431694,
"grad_norm": 1.8869766835100785,
"learning_rate": 5.024344284347762e-08,
"loss": 2.03,
"step": 2330
},
{
"epoch": 2.551912568306011,
"grad_norm": 1.8663624978318183,
"learning_rate": 5.023360631715606e-08,
"loss": 1.976,
"step": 2335
},
{
"epoch": 2.557377049180328,
"grad_norm": 1.8371733503594865,
"learning_rate": 5.0224132449398005e-08,
"loss": 2.0441,
"step": 2340
},
{
"epoch": 2.5628415300546448,
"grad_norm": 1.9433496190704163,
"learning_rate": 5.0215009161878455e-08,
"loss": 2.0678,
"step": 2345
},
{
"epoch": 2.5683060109289615,
"grad_norm": 1.9523689339991457,
"learning_rate": 5.020622473517704e-08,
"loss": 2.0311,
"step": 2350
},
{
"epoch": 2.5737704918032787,
"grad_norm": 1.8890575883757943,
"learning_rate": 5.0197767799411424e-08,
"loss": 2.0454,
"step": 2355
},
{
"epoch": 2.579234972677596,
"grad_norm": 1.9102594514962234,
"learning_rate": 5.0189627325081046e-08,
"loss": 2.0324,
"step": 2360
},
{
"epoch": 2.5846994535519126,
"grad_norm": 1.8390946791204932,
"learning_rate": 5.018179261411716e-08,
"loss": 2.0238,
"step": 2365
},
{
"epoch": 2.5901639344262293,
"grad_norm": 1.9030836331353156,
"learning_rate": 5.0174253291135456e-08,
"loss": 2.0424,
"step": 2370
},
{
"epoch": 2.5956284153005464,
"grad_norm": 1.9051341411136902,
"learning_rate": 5.016699929488718e-08,
"loss": 2.0464,
"step": 2375
},
{
"epoch": 2.6010928961748636,
"grad_norm": 1.8905549632374719,
"learning_rate": 5.016002086990525e-08,
"loss": 2.0401,
"step": 2380
},
{
"epoch": 2.6065573770491803,
"grad_norm": 1.8720851254061621,
"learning_rate": 5.015330855834148e-08,
"loss": 2.0313,
"step": 2385
},
{
"epoch": 2.612021857923497,
"grad_norm": 1.8630940777989557,
"learning_rate": 5.014685319199122e-08,
"loss": 2.0418,
"step": 2390
},
{
"epoch": 2.6174863387978142,
"grad_norm": 1.9536600782037399,
"learning_rate": 5.014064588450203e-08,
"loss": 2.0331,
"step": 2395
},
{
"epoch": 2.6229508196721314,
"grad_norm": 1.8537695794419704,
"learning_rate": 5.013467802376257e-08,
"loss": 2.0329,
"step": 2400
},
{
"epoch": 2.6229508196721314,
"eval_loss": 2.276965618133545,
"eval_runtime": 75.085,
"eval_samples_per_second": 86.662,
"eval_steps_per_second": 0.679,
"step": 2400
},
{
"epoch": 2.628415300546448,
"grad_norm": 1.856446433994958,
"learning_rate": 5.0128941264468425e-08,
"loss": 2.059,
"step": 2405
},
{
"epoch": 2.633879781420765,
"grad_norm": 1.8864099698250834,
"learning_rate": 5.012342752086127e-08,
"loss": 2.0366,
"step": 2410
},
{
"epoch": 2.639344262295082,
"grad_norm": 1.8965062954857936,
"learning_rate": 5.011812895963815e-08,
"loss": 2.0178,
"step": 2415
},
{
"epoch": 2.644808743169399,
"grad_norm": 1.9283596089907955,
"learning_rate": 5.011303799302737e-08,
"loss": 2.0664,
"step": 2420
},
{
"epoch": 2.650273224043716,
"grad_norm": 1.8806907065516518,
"learning_rate": 5.0108147272027865e-08,
"loss": 2.0187,
"step": 2425
},
{
"epoch": 2.6557377049180326,
"grad_norm": 1.9156510863376972,
"learning_rate": 5.0103449679808754e-08,
"loss": 2.0101,
"step": 2430
},
{
"epoch": 2.66120218579235,
"grad_norm": 2.126231497464025,
"learning_rate": 5.009893832526587e-08,
"loss": 1.9974,
"step": 2435
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1.9052967091959634,
"learning_rate": 5.0094606536732234e-08,
"loss": 2.0565,
"step": 2440
},
{
"epoch": 2.6721311475409837,
"grad_norm": 1.8394704411303013,
"learning_rate": 5.009044785583931e-08,
"loss": 2.0296,
"step": 2445
},
{
"epoch": 2.6775956284153004,
"grad_norm": 1.9096372006865225,
"learning_rate": 5.008645603152607e-08,
"loss": 2.0317,
"step": 2450
},
{
"epoch": 2.6830601092896176,
"grad_norm": 1.8523989798638827,
"learning_rate": 5.0082625014192866e-08,
"loss": 2.0261,
"step": 2455
},
{
"epoch": 2.6885245901639343,
"grad_norm": 1.8569921839621404,
"learning_rate": 5.007894894999717e-08,
"loss": 2.005,
"step": 2460
},
{
"epoch": 2.6939890710382515,
"grad_norm": 1.8836826290467388,
"learning_rate": 5.0075422175288365e-08,
"loss": 2.0464,
"step": 2465
},
{
"epoch": 2.699453551912568,
"grad_norm": 1.820199895319608,
"learning_rate": 5.007203921117863e-08,
"loss": 1.9825,
"step": 2470
},
{
"epoch": 2.7049180327868854,
"grad_norm": 1.904515026457721,
"learning_rate": 5.006879475824728e-08,
"loss": 2.0278,
"step": 2475
},
{
"epoch": 2.710382513661202,
"grad_norm": 1.943681983281218,
"learning_rate": 5.006568369137572e-08,
"loss": 2.0353,
"step": 2480
},
{
"epoch": 2.7158469945355193,
"grad_norm": 1.8593463289638106,
"learning_rate": 5.00627010547103e-08,
"loss": 2.0444,
"step": 2485
},
{
"epoch": 2.721311475409836,
"grad_norm": 1.8449163921598035,
"learning_rate": 5.005984205675053e-08,
"loss": 2.0289,
"step": 2490
},
{
"epoch": 2.726775956284153,
"grad_norm": 1.9258062804823874,
"learning_rate": 5.005710206555992e-08,
"loss": 1.9806,
"step": 2495
},
{
"epoch": 2.73224043715847,
"grad_norm": 1.886984272428234,
"learning_rate": 5.0054476604096995e-08,
"loss": 2.0158,
"step": 2500
},
{
"epoch": 2.737704918032787,
"grad_norm": 2.0315158822636548,
"learning_rate": 5.0051961345663824e-08,
"loss": 2.0218,
"step": 2505
},
{
"epoch": 2.7431693989071038,
"grad_norm": 1.8060895670908574,
"learning_rate": 5.0049552109469755e-08,
"loss": 2.0242,
"step": 2510
},
{
"epoch": 2.748633879781421,
"grad_norm": 1.8838071016430706,
"learning_rate": 5.004724485630778e-08,
"loss": 2.0522,
"step": 2515
},
{
"epoch": 2.7540983606557377,
"grad_norm": 1.8537277873432774,
"learning_rate": 5.004503568434121e-08,
"loss": 1.9872,
"step": 2520
},
{
"epoch": 2.7595628415300544,
"grad_norm": 1.9906937494224455,
"learning_rate": 5.004292082499825e-08,
"loss": 2.0369,
"step": 2525
},
{
"epoch": 2.7650273224043715,
"grad_norm": 1.8315661557574987,
"learning_rate": 5.0040896638972245e-08,
"loss": 2.0347,
"step": 2530
},
{
"epoch": 2.7704918032786887,
"grad_norm": 1.9006118838371153,
"learning_rate": 5.00389596123252e-08,
"loss": 2.0747,
"step": 2535
},
{
"epoch": 2.7759562841530054,
"grad_norm": 1.9282485545253067,
"learning_rate": 5.003710635269248e-08,
"loss": 2.0238,
"step": 2540
},
{
"epoch": 2.781420765027322,
"grad_norm": 1.8801002631728025,
"learning_rate": 5.0035333585586396e-08,
"loss": 2.0089,
"step": 2545
},
{
"epoch": 2.7868852459016393,
"grad_norm": 1.8790975179378258,
"learning_rate": 5.0033638150796495e-08,
"loss": 2.0503,
"step": 2550
},
{
"epoch": 2.7923497267759565,
"grad_norm": 1.8446856614496407,
"learning_rate": 5.0032016998884586e-08,
"loss": 2.0306,
"step": 2555
},
{
"epoch": 2.797814207650273,
"grad_norm": 1.9023341790959656,
"learning_rate": 5.003046718777224e-08,
"loss": 2.0464,
"step": 2560
},
{
"epoch": 2.80327868852459,
"grad_norm": 1.8356837164563038,
"learning_rate": 5.002898587941882e-08,
"loss": 2.0674,
"step": 2565
},
{
"epoch": 2.808743169398907,
"grad_norm": 1.9008745679241117,
"learning_rate": 5.002757033658803e-08,
"loss": 2.0508,
"step": 2570
},
{
"epoch": 2.8142076502732243,
"grad_norm": 1.8556179817129685,
"learning_rate": 5.0026217919700956e-08,
"loss": 2.0161,
"step": 2575
},
{
"epoch": 2.819672131147541,
"grad_norm": 1.8898728370320337,
"learning_rate": 5.0024926083773705e-08,
"loss": 2.0484,
"step": 2580
},
{
"epoch": 2.8251366120218577,
"grad_norm": 1.8869809093319543,
"learning_rate": 5.002369237543775e-08,
"loss": 2.0164,
"step": 2585
},
{
"epoch": 2.830601092896175,
"grad_norm": 2.0230326546469355,
"learning_rate": 5.0022514430041064e-08,
"loss": 2.035,
"step": 2590
},
{
"epoch": 2.836065573770492,
"grad_norm": 1.8582179431821813,
"learning_rate": 5.002138996882823e-08,
"loss": 2.0064,
"step": 2595
},
{
"epoch": 2.841530054644809,
"grad_norm": 1.942996732626371,
"learning_rate": 5.002031679619775e-08,
"loss": 2.0213,
"step": 2600
},
{
"epoch": 2.841530054644809,
"eval_loss": 2.276575803756714,
"eval_runtime": 75.1591,
"eval_samples_per_second": 86.576,
"eval_steps_per_second": 0.679,
"step": 2600
},
{
"epoch": 2.8469945355191255,
"grad_norm": 1.8406548791619128,
"learning_rate": 5.0019292797034756e-08,
"loss": 2.0239,
"step": 2605
},
{
"epoch": 2.8524590163934427,
"grad_norm": 1.8433943845894334,
"learning_rate": 5.001831593411739e-08,
"loss": 2.0306,
"step": 2610
},
{
"epoch": 2.8579234972677594,
"grad_norm": 1.9214309861779986,
"learning_rate": 5.0017384245595145e-08,
"loss": 2.0792,
"step": 2615
},
{
"epoch": 2.8633879781420766,
"grad_norm": 1.892919325287394,
"learning_rate": 5.001649584253754e-08,
"loss": 2.0389,
"step": 2620
},
{
"epoch": 2.8688524590163933,
"grad_norm": 1.8352505329010071,
"learning_rate": 5.001564890655143e-08,
"loss": 2.0385,
"step": 2625
},
{
"epoch": 2.8743169398907105,
"grad_norm": 1.9094968998206336,
"learning_rate": 5.001484168746532e-08,
"loss": 2.0307,
"step": 2630
},
{
"epoch": 2.879781420765027,
"grad_norm": 1.913583135594377,
"learning_rate": 5.001407250107926e-08,
"loss": 2.0251,
"step": 2635
},
{
"epoch": 2.8852459016393444,
"grad_norm": 1.8983404483614361,
"learning_rate": 5.001333972697852e-08,
"loss": 2.0251,
"step": 2640
},
{
"epoch": 2.890710382513661,
"grad_norm": 1.9615347509758865,
"learning_rate": 5.001264180640978e-08,
"loss": 2.0367,
"step": 2645
},
{
"epoch": 2.8961748633879782,
"grad_norm": 1.9624200439383404,
"learning_rate": 5.001197724021815e-08,
"loss": 2.062,
"step": 2650
},
{
"epoch": 2.901639344262295,
"grad_norm": 1.8696575109999418,
"learning_rate": 5.001134458684368e-08,
"loss": 2.0521,
"step": 2655
},
{
"epoch": 2.907103825136612,
"grad_norm": 1.8848143146406755,
"learning_rate": 5.001074246037584e-08,
"loss": 2.0034,
"step": 2660
},
{
"epoch": 2.912568306010929,
"grad_norm": 1.973139426778756,
"learning_rate": 5.001016952866467e-08,
"loss": 1.9532,
"step": 2665
},
{
"epoch": 2.918032786885246,
"grad_norm": 1.9504580432497,
"learning_rate": 5.000962451148704e-08,
"loss": 2.048,
"step": 2670
},
{
"epoch": 2.9234972677595628,
"grad_norm": 1.953413058357899,
"learning_rate": 5.0009106178766914e-08,
"loss": 2.0661,
"step": 2675
},
{
"epoch": 2.92896174863388,
"grad_norm": 1.881395664536309,
"learning_rate": 5.000861334884807e-08,
"loss": 2.022,
"step": 2680
},
{
"epoch": 2.9344262295081966,
"grad_norm": 1.8914817780033801,
"learning_rate": 5.0008144886818085e-08,
"loss": 1.9874,
"step": 2685
},
{
"epoch": 2.939890710382514,
"grad_norm": 2.017787415229173,
"learning_rate": 5.000769970288234e-08,
"loss": 2.0318,
"step": 2690
},
{
"epoch": 2.9453551912568305,
"grad_norm": 1.8421207610475552,
"learning_rate": 5.000727675078668e-08,
"loss": 2.0521,
"step": 2695
},
{
"epoch": 2.9508196721311473,
"grad_norm": 1.8659084624986955,
"learning_rate": 5.0006875026287623e-08,
"loss": 2.0089,
"step": 2700
},
{
"epoch": 2.9562841530054644,
"grad_norm": 1.9075873541304413,
"learning_rate": 5.0006493565668884e-08,
"loss": 2.0478,
"step": 2705
},
{
"epoch": 2.9617486338797816,
"grad_norm": 1.9745047266267015,
"learning_rate": 5.0006131444302976e-08,
"loss": 2.0439,
"step": 2710
},
{
"epoch": 2.9672131147540983,
"grad_norm": 1.8923014776736973,
"learning_rate": 5.000578777525686e-08,
"loss": 2.0554,
"step": 2715
},
{
"epoch": 2.972677595628415,
"grad_norm": 1.9080716218498992,
"learning_rate": 5.0005461707940365e-08,
"loss": 2.0322,
"step": 2720
},
{
"epoch": 2.978142076502732,
"grad_norm": 1.9393226464756443,
"learning_rate": 5.0005152426796475e-08,
"loss": 2.0324,
"step": 2725
},
{
"epoch": 2.9836065573770494,
"grad_norm": 1.8889314625477465,
"learning_rate": 5.000485915003216e-08,
"loss": 2.0421,
"step": 2730
},
{
"epoch": 2.989071038251366,
"grad_norm": 1.9331494885327474,
"learning_rate": 5.0004581128388925e-08,
"loss": 2.0398,
"step": 2735
},
{
"epoch": 2.994535519125683,
"grad_norm": 1.8595278802194335,
"learning_rate": 5.000431764395187e-08,
"loss": 2.0376,
"step": 2740
},
{
"epoch": 3.0,
"grad_norm": 1.8505999974369438,
"learning_rate": 5.000406800899633e-08,
"loss": 2.0272,
"step": 2745
},
{
"epoch": 3.0054644808743167,
"grad_norm": 1.8128081448976874,
"learning_rate": 5.00038315648711e-08,
"loss": 2.0134,
"step": 2750
},
{
"epoch": 3.010928961748634,
"grad_norm": 1.849783427721221,
"learning_rate": 5.000360768091725e-08,
"loss": 1.962,
"step": 2755
},
{
"epoch": 3.0163934426229506,
"grad_norm": 1.896562041216816,
"learning_rate": 5.0003395753421604e-08,
"loss": 2.0457,
"step": 2760
},
{
"epoch": 3.021857923497268,
"grad_norm": 1.9734010112151688,
"learning_rate": 5.0003195204603886e-08,
"loss": 2.0289,
"step": 2765
},
{
"epoch": 3.0273224043715845,
"grad_norm": 1.94494245556754,
"learning_rate": 5.000300548163672e-08,
"loss": 2.0502,
"step": 2770
},
{
"epoch": 3.0327868852459017,
"grad_norm": 1.853549834674588,
"learning_rate": 5.0002826055697557e-08,
"loss": 2.0073,
"step": 2775
},
{
"epoch": 3.0382513661202184,
"grad_norm": 1.9550895805921849,
"learning_rate": 5.000265642105161e-08,
"loss": 2.0578,
"step": 2780
},
{
"epoch": 3.0437158469945356,
"grad_norm": 1.9832050839540076,
"learning_rate": 5.0002496094165e-08,
"loss": 2.0593,
"step": 2785
},
{
"epoch": 3.0491803278688523,
"grad_norm": 1.8801978736078537,
"learning_rate": 5.000234461284729e-08,
"loss": 2.0796,
"step": 2790
},
{
"epoch": 3.0546448087431695,
"grad_norm": 1.880459663640695,
"learning_rate": 5.000220153542248e-08,
"loss": 2.0813,
"step": 2795
},
{
"epoch": 3.060109289617486,
"grad_norm": 1.8766167117396393,
"learning_rate": 5.000206643992788e-08,
"loss": 2.0559,
"step": 2800
},
{
"epoch": 3.060109289617486,
"eval_loss": 2.277146816253662,
"eval_runtime": 75.0877,
"eval_samples_per_second": 86.659,
"eval_steps_per_second": 0.679,
"step": 2800
},
{
"epoch": 3.0655737704918034,
"grad_norm": 1.9254106923649494,
"learning_rate": 5.000193892333986e-08,
"loss": 2.0661,
"step": 2805
},
{
"epoch": 3.07103825136612,
"grad_norm": 1.822299900106021,
"learning_rate": 5.000181860082585e-08,
"loss": 2.0499,
"step": 2810
},
{
"epoch": 3.0765027322404372,
"grad_norm": 1.8735243038644225,
"learning_rate": 5.0001705105021744e-08,
"loss": 2.0296,
"step": 2815
},
{
"epoch": 3.081967213114754,
"grad_norm": 1.9563218452911402,
"learning_rate": 5.000159808533418e-08,
"loss": 1.9812,
"step": 2820
},
{
"epoch": 3.087431693989071,
"grad_norm": 1.8334084094484262,
"learning_rate": 5.00014972072667e-08,
"loss": 2.0074,
"step": 2825
},
{
"epoch": 3.092896174863388,
"grad_norm": 1.8655682558825502,
"learning_rate": 5.000140215176936e-08,
"loss": 2.0072,
"step": 2830
},
{
"epoch": 3.098360655737705,
"grad_norm": 1.9205939797923823,
"learning_rate": 5.000131261461091e-08,
"loss": 1.9616,
"step": 2835
},
{
"epoch": 3.1038251366120218,
"grad_norm": 2.14258246365134,
"learning_rate": 5.0001228305773056e-08,
"loss": 2.0388,
"step": 2840
},
{
"epoch": 3.109289617486339,
"grad_norm": 1.909977704305264,
"learning_rate": 5.000114894886601e-08,
"loss": 2.0023,
"step": 2845
},
{
"epoch": 3.1147540983606556,
"grad_norm": 2.0206991852732394,
"learning_rate": 5.000107428056477e-08,
"loss": 2.0111,
"step": 2850
},
{
"epoch": 3.120218579234973,
"grad_norm": 1.8666014246751432,
"learning_rate": 5.000100405006557e-08,
"loss": 2.0219,
"step": 2855
},
{
"epoch": 3.1256830601092895,
"grad_norm": 1.9352070214880581,
"learning_rate": 5.0000938018561714e-08,
"loss": 2.029,
"step": 2860
},
{
"epoch": 3.1311475409836067,
"grad_norm": 1.8998730338754464,
"learning_rate": 5.0000875958738443e-08,
"loss": 2.014,
"step": 2865
},
{
"epoch": 3.1366120218579234,
"grad_norm": 1.93622910502082,
"learning_rate": 5.000081765428609e-08,
"loss": 2.0348,
"step": 2870
},
{
"epoch": 3.1420765027322406,
"grad_norm": 1.8895366176405546,
"learning_rate": 5.000076289943102e-08,
"loss": 2.0577,
"step": 2875
},
{
"epoch": 3.1475409836065573,
"grad_norm": 1.942718295521934,
"learning_rate": 5.0000711498483816e-08,
"loss": 2.0452,
"step": 2880
},
{
"epoch": 3.1530054644808745,
"grad_norm": 1.8568483287237603,
"learning_rate": 5.00006632654042e-08,
"loss": 2.0405,
"step": 2885
},
{
"epoch": 3.158469945355191,
"grad_norm": 1.8966452464630115,
"learning_rate": 5.00006180233821e-08,
"loss": 2.0307,
"step": 2890
},
{
"epoch": 3.1639344262295084,
"grad_norm": 1.8844492467485716,
"learning_rate": 5.000057560443445e-08,
"loss": 2.038,
"step": 2895
},
{
"epoch": 3.169398907103825,
"grad_norm": 1.9541049062507123,
"learning_rate": 5.000053584901716e-08,
"loss": 2.0324,
"step": 2900
},
{
"epoch": 3.1748633879781423,
"grad_norm": 1.8762220421293871,
"learning_rate": 5.0000498605651776e-08,
"loss": 2.0117,
"step": 2905
},
{
"epoch": 3.180327868852459,
"grad_norm": 1.8972364762038987,
"learning_rate": 5.000046373056645e-08,
"loss": 2.0539,
"step": 2910
},
{
"epoch": 3.185792349726776,
"grad_norm": 1.8830419259378766,
"learning_rate": 5.000043108735063e-08,
"loss": 2.0143,
"step": 2915
},
{
"epoch": 3.191256830601093,
"grad_norm": 1.8933082226852906,
"learning_rate": 5.000040054662314e-08,
"loss": 2.0245,
"step": 2920
},
{
"epoch": 3.19672131147541,
"grad_norm": 1.8584858417385766,
"learning_rate": 5.000037198571318e-08,
"loss": 1.9939,
"step": 2925
},
{
"epoch": 3.202185792349727,
"grad_norm": 1.9078023286100567,
"learning_rate": 5.000034528835373e-08,
"loss": 2.0418,
"step": 2930
},
{
"epoch": 3.2076502732240435,
"grad_norm": 1.902956383213903,
"learning_rate": 5.00003203443872e-08,
"loss": 2.0302,
"step": 2935
},
{
"epoch": 3.2131147540983607,
"grad_norm": 1.8818748470466278,
"learning_rate": 5.000029704948257e-08,
"loss": 2.0637,
"step": 2940
},
{
"epoch": 3.2185792349726774,
"grad_norm": 1.914518786096776,
"learning_rate": 5.0000275304863995e-08,
"loss": 2.014,
"step": 2945
},
{
"epoch": 3.2240437158469946,
"grad_norm": 1.9857131146213522,
"learning_rate": 5.000025501705019e-08,
"loss": 2.0159,
"step": 2950
},
{
"epoch": 3.2295081967213113,
"grad_norm": 1.8481404227503944,
"learning_rate": 5.000023609760444e-08,
"loss": 2.0345,
"step": 2955
},
{
"epoch": 3.2349726775956285,
"grad_norm": 1.9248498594561754,
"learning_rate": 5.00002184628948e-08,
"loss": 1.9741,
"step": 2960
},
{
"epoch": 3.240437158469945,
"grad_norm": 1.9138227507681809,
"learning_rate": 5.000020203386406e-08,
"loss": 1.9825,
"step": 2965
},
{
"epoch": 3.2459016393442623,
"grad_norm": 1.9553377832252659,
"learning_rate": 5.000018673580931e-08,
"loss": 2.0348,
"step": 2970
},
{
"epoch": 3.251366120218579,
"grad_norm": 1.923854238806126,
"learning_rate": 5.0000172498170615e-08,
"loss": 2.033,
"step": 2975
},
{
"epoch": 3.2568306010928962,
"grad_norm": 1.8966593579783744,
"learning_rate": 5.000015925432853e-08,
"loss": 2.0051,
"step": 2980
},
{
"epoch": 3.262295081967213,
"grad_norm": 1.8885418073350184,
"learning_rate": 5.000014694141023e-08,
"loss": 2.0325,
"step": 2985
},
{
"epoch": 3.26775956284153,
"grad_norm": 1.9005648234764283,
"learning_rate": 5.000013550010379e-08,
"loss": 2.0387,
"step": 2990
},
{
"epoch": 3.273224043715847,
"grad_norm": 1.8497186687415175,
"learning_rate": 5.0000124874480465e-08,
"loss": 1.9916,
"step": 2995
},
{
"epoch": 3.278688524590164,
"grad_norm": 1.9311355275570043,
"learning_rate": 5.000011501182461e-08,
"loss": 2.0543,
"step": 3000
},
{
"epoch": 3.278688524590164,
"eval_loss": 2.2772867679595947,
"eval_runtime": 75.0871,
"eval_samples_per_second": 86.659,
"eval_steps_per_second": 0.679,
"step": 3000
},
{
"epoch": 3.2841530054644807,
"grad_norm": 1.8490487579130825,
"learning_rate": 5.000010586247099e-08,
"loss": 2.0141,
"step": 3005
},
{
"epoch": 3.289617486338798,
"grad_norm": 1.8722847979242898,
"learning_rate": 5.0000097379649185e-08,
"loss": 2.0399,
"step": 3010
},
{
"epoch": 3.2950819672131146,
"grad_norm": 1.8920328829395436,
"learning_rate": 5.000008951933488e-08,
"loss": 2.0403,
"step": 3015
},
{
"epoch": 3.300546448087432,
"grad_norm": 1.859765946380407,
"learning_rate": 5.000008224010771e-08,
"loss": 2.0231,
"step": 3020
},
{
"epoch": 3.3060109289617485,
"grad_norm": 1.889873157845456,
"learning_rate": 5.0000075503015504e-08,
"loss": 2.0029,
"step": 3025
},
{
"epoch": 3.3114754098360657,
"grad_norm": 1.9194945076344194,
"learning_rate": 5.000006927144461e-08,
"loss": 2.0375,
"step": 3030
},
{
"epoch": 3.3169398907103824,
"grad_norm": 1.8949475106036582,
"learning_rate": 5.000006351099609e-08,
"loss": 2.0234,
"step": 3035
},
{
"epoch": 3.3224043715846996,
"grad_norm": 1.925413901133648,
"learning_rate": 5.0000058189367665e-08,
"loss": 2.0335,
"step": 3040
},
{
"epoch": 3.3278688524590163,
"grad_norm": 1.8637852431158481,
"learning_rate": 5.0000053276240954e-08,
"loss": 2.0339,
"step": 3045
},
{
"epoch": 3.3333333333333335,
"grad_norm": 1.990488877814686,
"learning_rate": 5.0000048743174075e-08,
"loss": 2.0116,
"step": 3050
},
{
"epoch": 3.33879781420765,
"grad_norm": 1.9066583759059983,
"learning_rate": 5.0000044563499215e-08,
"loss": 2.0752,
"step": 3055
},
{
"epoch": 3.3442622950819674,
"grad_norm": 1.9013799438501833,
"learning_rate": 5.0000040712225024e-08,
"loss": 2.0225,
"step": 3060
},
{
"epoch": 3.349726775956284,
"grad_norm": 1.8226817121910608,
"learning_rate": 5.000003716594369e-08,
"loss": 2.0035,
"step": 3065
},
{
"epoch": 3.3551912568306013,
"grad_norm": 1.8532253234195688,
"learning_rate": 5.000003390274239e-08,
"loss": 2.0492,
"step": 3070
},
{
"epoch": 3.360655737704918,
"grad_norm": 1.8666444656750065,
"learning_rate": 5.0000030902119114e-08,
"loss": 1.9977,
"step": 3075
},
{
"epoch": 3.366120218579235,
"grad_norm": 1.883761246140252,
"learning_rate": 5.000002814490251e-08,
"loss": 2.0615,
"step": 3080
},
{
"epoch": 3.371584699453552,
"grad_norm": 1.952894075205677,
"learning_rate": 5.000002561317571e-08,
"loss": 2.0141,
"step": 3085
},
{
"epoch": 3.3770491803278686,
"grad_norm": 1.8928059074027184,
"learning_rate": 5.000002329020387e-08,
"loss": 2.0403,
"step": 3090
},
{
"epoch": 3.3825136612021858,
"grad_norm": 2.0098225664920224,
"learning_rate": 5.0000021160365414e-08,
"loss": 2.0737,
"step": 3095
},
{
"epoch": 3.387978142076503,
"grad_norm": 1.8582907329607212,
"learning_rate": 5.000001920908665e-08,
"loss": 2.0323,
"step": 3100
},
{
"epoch": 3.3934426229508197,
"grad_norm": 1.9004514759105224,
"learning_rate": 5.000001742277974e-08,
"loss": 2.0378,
"step": 3105
},
{
"epoch": 3.3989071038251364,
"grad_norm": 1.9251008716345306,
"learning_rate": 5.0000015788783874e-08,
"loss": 1.9869,
"step": 3110
},
{
"epoch": 3.4043715846994536,
"grad_norm": 1.8991213194710543,
"learning_rate": 5.000001429530941e-08,
"loss": 2.0395,
"step": 3115
},
{
"epoch": 3.4098360655737707,
"grad_norm": 1.8526036080467823,
"learning_rate": 5.000001293138501e-08,
"loss": 2.0095,
"step": 3120
},
{
"epoch": 3.4153005464480874,
"grad_norm": 1.8346508560296197,
"learning_rate": 5.0000011686807445e-08,
"loss": 2.0067,
"step": 3125
},
{
"epoch": 3.420765027322404,
"grad_norm": 1.8407135510103516,
"learning_rate": 5.000001055209419e-08,
"loss": 2.0252,
"step": 3130
},
{
"epoch": 3.4262295081967213,
"grad_norm": 1.8721531260003674,
"learning_rate": 5.000000951843842e-08,
"loss": 2.0432,
"step": 3135
},
{
"epoch": 3.431693989071038,
"grad_norm": 1.9072265607352163,
"learning_rate": 5.0000008577666524e-08,
"loss": 2.0312,
"step": 3140
},
{
"epoch": 3.4371584699453552,
"grad_norm": 1.9177737469818847,
"learning_rate": 5.000000772219792e-08,
"loss": 2.0066,
"step": 3145
},
{
"epoch": 3.442622950819672,
"grad_norm": 1.9149658715997013,
"learning_rate": 5.000000694500704e-08,
"loss": 2.0064,
"step": 3150
},
{
"epoch": 3.448087431693989,
"grad_norm": 1.9406268887306055,
"learning_rate": 5.000000623958742e-08,
"loss": 2.0253,
"step": 3155
},
{
"epoch": 3.453551912568306,
"grad_norm": 1.93322985142905,
"learning_rate": 5.000000559991787e-08,
"loss": 2.0296,
"step": 3160
},
{
"epoch": 3.459016393442623,
"grad_norm": 2.006884694469749,
"learning_rate": 5.000000502043047e-08,
"loss": 2.015,
"step": 3165
},
{
"epoch": 3.4644808743169397,
"grad_norm": 1.973665285115433,
"learning_rate": 5.0000004495980446e-08,
"loss": 2.0621,
"step": 3170
},
{
"epoch": 3.469945355191257,
"grad_norm": 1.9098826344464872,
"learning_rate": 5.000000402181774e-08,
"loss": 2.0137,
"step": 3175
},
{
"epoch": 3.4754098360655736,
"grad_norm": 1.900637639917567,
"learning_rate": 5.000000359356028e-08,
"loss": 2.0411,
"step": 3180
},
{
"epoch": 3.480874316939891,
"grad_norm": 1.9657694744447054,
"learning_rate": 5.0000003207168756e-08,
"loss": 2.0667,
"step": 3185
},
{
"epoch": 3.4863387978142075,
"grad_norm": 1.8794891535447487,
"learning_rate": 5.000000285892296e-08,
"loss": 2.0421,
"step": 3190
},
{
"epoch": 3.4918032786885247,
"grad_norm": 1.9073660767776919,
"learning_rate": 5.000000254539948e-08,
"loss": 2.0722,
"step": 3195
},
{
"epoch": 3.4972677595628414,
"grad_norm": 1.9968851234028737,
"learning_rate": 5.000000226345078e-08,
"loss": 2.0317,
"step": 3200
},
{
"epoch": 3.4972677595628414,
"eval_loss": 2.2772328853607178,
"eval_runtime": 75.1937,
"eval_samples_per_second": 86.536,
"eval_steps_per_second": 0.678,
"step": 3200
},
{
"epoch": 3.5027322404371586,
"grad_norm": 1.9363915857414498,
"learning_rate": 5.000000201018557e-08,
"loss": 2.0378,
"step": 3205
},
{
"epoch": 3.5081967213114753,
"grad_norm": 1.9493487909740663,
"learning_rate": 5.0000001782950314e-08,
"loss": 2.0429,
"step": 3210
},
{
"epoch": 3.5136612021857925,
"grad_norm": 1.8974490684659184,
"learning_rate": 5.000000157931199e-08,
"loss": 2.0341,
"step": 3215
},
{
"epoch": 3.519125683060109,
"grad_norm": 1.8750804355544737,
"learning_rate": 5.000000139704186e-08,
"loss": 2.0143,
"step": 3220
},
{
"epoch": 3.5245901639344264,
"grad_norm": 1.8835309853885958,
"learning_rate": 5.0000001234100294e-08,
"loss": 2.0252,
"step": 3225
},
{
"epoch": 3.530054644808743,
"grad_norm": 1.9036966438501197,
"learning_rate": 5.000000108862262e-08,
"loss": 2.0031,
"step": 3230
},
{
"epoch": 3.5355191256830603,
"grad_norm": 1.8701728772297301,
"learning_rate": 5.0000000958905794e-08,
"loss": 2.0028,
"step": 3235
},
{
"epoch": 3.540983606557377,
"grad_norm": 1.8785086675187268,
"learning_rate": 5.000000084339605e-08,
"loss": 1.9671,
"step": 3240
},
{
"epoch": 3.546448087431694,
"grad_norm": 1.9287901930232905,
"learning_rate": 5.0000000740677285e-08,
"loss": 2.0464,
"step": 3245
},
{
"epoch": 3.551912568306011,
"grad_norm": 1.925166946388218,
"learning_rate": 5.00000006494603e-08,
"loss": 1.9629,
"step": 3250
},
{
"epoch": 3.557377049180328,
"grad_norm": 1.9147306157624264,
"learning_rate": 5.000000056857271e-08,
"loss": 2.0377,
"step": 3255
},
{
"epoch": 3.5628415300546448,
"grad_norm": 2.00912227135468,
"learning_rate": 5.0000000496949596e-08,
"loss": 2.0519,
"step": 3260
},
{
"epoch": 3.5683060109289615,
"grad_norm": 1.914393129097604,
"learning_rate": 5.000000043362476e-08,
"loss": 1.9921,
"step": 3265
},
{
"epoch": 3.5737704918032787,
"grad_norm": 1.8986536102948053,
"learning_rate": 5.000000037772264e-08,
"loss": 2.037,
"step": 3270
},
{
"epoch": 3.579234972677596,
"grad_norm": 2.1302629939845272,
"learning_rate": 5.000000032845078e-08,
"loss": 2.0352,
"step": 3275
},
{
"epoch": 3.5846994535519126,
"grad_norm": 1.9083903824546993,
"learning_rate": 5.0000000285092845e-08,
"loss": 2.0432,
"step": 3280
},
{
"epoch": 3.5901639344262293,
"grad_norm": 1.9795975235944003,
"learning_rate": 5.000000024700213e-08,
"loss": 2.0047,
"step": 3285
},
{
"epoch": 3.5956284153005464,
"grad_norm": 1.909947661859089,
"learning_rate": 5.000000021359558e-08,
"loss": 2.031,
"step": 3290
},
{
"epoch": 3.6010928961748636,
"grad_norm": 1.873647300121296,
"learning_rate": 5.000000018434823e-08,
"loss": 2.0427,
"step": 3295
},
{
"epoch": 3.6065573770491803,
"grad_norm": 1.8870603921175668,
"learning_rate": 5.000000015878808e-08,
"loss": 1.9943,
"step": 3300
},
{
"epoch": 3.612021857923497,
"grad_norm": 1.83764062220617,
"learning_rate": 5.000000013649137e-08,
"loss": 2.0278,
"step": 3305
},
{
"epoch": 3.6174863387978142,
"grad_norm": 1.8700657377845233,
"learning_rate": 5.0000000117078175e-08,
"loss": 2.016,
"step": 3310
},
{
"epoch": 3.6229508196721314,
"grad_norm": 1.9024345479748699,
"learning_rate": 5.000000010020843e-08,
"loss": 2.0335,
"step": 3315
},
{
"epoch": 3.628415300546448,
"grad_norm": 1.8905483742070606,
"learning_rate": 5.000000008557818e-08,
"loss": 2.018,
"step": 3320
},
{
"epoch": 3.633879781420765,
"grad_norm": 1.9223349161654961,
"learning_rate": 5.0000000072916214e-08,
"loss": 2.0213,
"step": 3325
},
{
"epoch": 3.639344262295082,
"grad_norm": 1.823567332382863,
"learning_rate": 5.000000006198092e-08,
"loss": 1.987,
"step": 3330
},
{
"epoch": 3.644808743169399,
"grad_norm": 1.8810678051906216,
"learning_rate": 5.00000000525574e-08,
"loss": 1.9769,
"step": 3335
},
{
"epoch": 3.650273224043716,
"grad_norm": 1.9204431005146232,
"learning_rate": 5.0000000044454894e-08,
"loss": 2.0674,
"step": 3340
},
{
"epoch": 3.6557377049180326,
"grad_norm": 1.8872295947781799,
"learning_rate": 5.000000003750432e-08,
"loss": 2.0109,
"step": 3345
},
{
"epoch": 3.66120218579235,
"grad_norm": 1.8893937179160833,
"learning_rate": 5.000000003155614e-08,
"loss": 2.0475,
"step": 3350
},
{
"epoch": 3.6666666666666665,
"grad_norm": 1.9582368602914404,
"learning_rate": 5.000000002647831e-08,
"loss": 2.0292,
"step": 3355
},
{
"epoch": 3.6721311475409837,
"grad_norm": 1.8759259337994865,
"learning_rate": 5.000000002215448e-08,
"loss": 2.0248,
"step": 3360
},
{
"epoch": 3.6775956284153004,
"grad_norm": 1.815903984549811,
"learning_rate": 5.0000000018482356e-08,
"loss": 2.0287,
"step": 3365
},
{
"epoch": 3.6830601092896176,
"grad_norm": 1.8747014733431713,
"learning_rate": 5.000000001537216e-08,
"loss": 2.0457,
"step": 3370
},
{
"epoch": 3.6885245901639343,
"grad_norm": 1.9458767620629445,
"learning_rate": 5.000000001274526e-08,
"loss": 2.0515,
"step": 3375
},
{
"epoch": 3.6939890710382515,
"grad_norm": 1.9268929400448993,
"learning_rate": 5.0000000010533005e-08,
"loss": 2.0511,
"step": 3380
},
{
"epoch": 3.699453551912568,
"grad_norm": 1.902962520090544,
"learning_rate": 5.0000000008675514e-08,
"loss": 2.0558,
"step": 3385
},
{
"epoch": 3.7049180327868854,
"grad_norm": 1.8229295875308797,
"learning_rate": 5.000000000712075e-08,
"loss": 2.0166,
"step": 3390
},
{
"epoch": 3.710382513661202,
"grad_norm": 1.836606346535709,
"learning_rate": 5.0000000005823554e-08,
"loss": 2.0403,
"step": 3395
},
{
"epoch": 3.7158469945355193,
"grad_norm": 1.9116742151193724,
"learning_rate": 5.0000000004744865e-08,
"loss": 1.988,
"step": 3400
},
{
"epoch": 3.7158469945355193,
"eval_loss": 2.2769548892974854,
"eval_runtime": 74.9791,
"eval_samples_per_second": 86.784,
"eval_steps_per_second": 0.68,
"step": 3400
},
{
"epoch": 3.721311475409836,
"grad_norm": 1.8962679275361203,
"learning_rate": 5.000000000385098e-08,
"loss": 2.0077,
"step": 3405
},
{
"epoch": 3.726775956284153,
"grad_norm": 1.8893473180089084,
"learning_rate": 5.0000000003112903e-08,
"loss": 2.0275,
"step": 3410
},
{
"epoch": 3.73224043715847,
"grad_norm": 1.9059398028165249,
"learning_rate": 5.0000000002505746e-08,
"loss": 2.0248,
"step": 3415
},
{
"epoch": 3.737704918032787,
"grad_norm": 1.9284706550848763,
"learning_rate": 5.000000000200822e-08,
"loss": 2.0841,
"step": 3420
},
{
"epoch": 3.7431693989071038,
"grad_norm": 1.8971305679426038,
"learning_rate": 5.000000000160219e-08,
"loss": 2.0205,
"step": 3425
},
{
"epoch": 3.748633879781421,
"grad_norm": 1.9639122926123413,
"learning_rate": 5.000000000127221e-08,
"loss": 2.0438,
"step": 3430
},
{
"epoch": 3.7540983606557377,
"grad_norm": 1.9660777954794344,
"learning_rate": 5.000000000100521e-08,
"loss": 2.0285,
"step": 3435
},
{
"epoch": 3.7595628415300544,
"grad_norm": 1.915135487255815,
"learning_rate": 5.000000000079017e-08,
"loss": 1.9938,
"step": 3440
},
{
"epoch": 3.7650273224043715,
"grad_norm": 1.9104288280645758,
"learning_rate": 5.000000000061779e-08,
"loss": 2.0109,
"step": 3445
},
{
"epoch": 3.7704918032786887,
"grad_norm": 1.9163223330431955,
"learning_rate": 5.0000000000480305e-08,
"loss": 2.0479,
"step": 3450
},
{
"epoch": 3.7759562841530054,
"grad_norm": 1.8942590608094447,
"learning_rate": 5.0000000000371217e-08,
"loss": 2.0265,
"step": 3455
},
{
"epoch": 3.781420765027322,
"grad_norm": 1.8979108959832878,
"learning_rate": 5.0000000000285143e-08,
"loss": 2.0483,
"step": 3460
},
{
"epoch": 3.7868852459016393,
"grad_norm": 2.0082325735504205,
"learning_rate": 5.000000000021761e-08,
"loss": 2.0249,
"step": 3465
},
{
"epoch": 3.7923497267759565,
"grad_norm": 1.9319079542553508,
"learning_rate": 5.0000000000164944e-08,
"loss": 2.039,
"step": 3470
},
{
"epoch": 3.797814207650273,
"grad_norm": 1.8962147679193577,
"learning_rate": 5.0000000000124134e-08,
"loss": 1.9694,
"step": 3475
},
{
"epoch": 3.80327868852459,
"grad_norm": 1.9119145259888968,
"learning_rate": 5.0000000000092715e-08,
"loss": 2.0416,
"step": 3480
},
{
"epoch": 3.808743169398907,
"grad_norm": 1.8560975935966715,
"learning_rate": 5.00000000000687e-08,
"loss": 2.0174,
"step": 3485
},
{
"epoch": 3.8142076502732243,
"grad_norm": 1.9664941712397381,
"learning_rate": 5.000000000005048e-08,
"loss": 2.0503,
"step": 3490
},
{
"epoch": 3.819672131147541,
"grad_norm": 1.8658008767578975,
"learning_rate": 5.0000000000036764e-08,
"loss": 2.0293,
"step": 3495
},
{
"epoch": 3.8251366120218577,
"grad_norm": 1.9253208530977064,
"learning_rate": 5.000000000002653e-08,
"loss": 2.0429,
"step": 3500
},
{
"epoch": 3.830601092896175,
"grad_norm": 1.8810005361263469,
"learning_rate": 5.000000000001895e-08,
"loss": 2.035,
"step": 3505
},
{
"epoch": 3.836065573770492,
"grad_norm": 1.893681875957613,
"learning_rate": 5.000000000001339e-08,
"loss": 2.0732,
"step": 3510
},
{
"epoch": 3.841530054644809,
"grad_norm": 1.9417607113095643,
"learning_rate": 5.0000000000009355e-08,
"loss": 2.002,
"step": 3515
},
{
"epoch": 3.8469945355191255,
"grad_norm": 1.9989745014112892,
"learning_rate": 5.000000000000646e-08,
"loss": 2.0525,
"step": 3520
},
{
"epoch": 3.8524590163934427,
"grad_norm": 1.8555402595578698,
"learning_rate": 5.0000000000004405e-08,
"loss": 2.0228,
"step": 3525
},
{
"epoch": 3.8579234972677594,
"grad_norm": 1.9137513054849469,
"learning_rate": 5.0000000000002956e-08,
"loss": 2.0657,
"step": 3530
},
{
"epoch": 3.8633879781420766,
"grad_norm": 1.9157898282989583,
"learning_rate": 5.0000000000001957e-08,
"loss": 2.0173,
"step": 3535
},
{
"epoch": 3.8688524590163933,
"grad_norm": 1.9273730542054064,
"learning_rate": 5.0000000000001275e-08,
"loss": 2.0529,
"step": 3540
},
{
"epoch": 3.8743169398907105,
"grad_norm": 1.8997473790640476,
"learning_rate": 5.000000000000082e-08,
"loss": 2.0464,
"step": 3545
},
{
"epoch": 3.879781420765027,
"grad_norm": 1.9722252114630803,
"learning_rate": 5.0000000000000514e-08,
"loss": 2.0219,
"step": 3550
},
{
"epoch": 3.8852459016393444,
"grad_norm": 1.9361513715190686,
"learning_rate": 5.0000000000000315e-08,
"loss": 2.0549,
"step": 3555
},
{
"epoch": 3.890710382513661,
"grad_norm": 1.9521279215167433,
"learning_rate": 5.000000000000019e-08,
"loss": 2.0019,
"step": 3560
},
{
"epoch": 3.8961748633879782,
"grad_norm": 1.8990353241401117,
"learning_rate": 5.000000000000011e-08,
"loss": 2.037,
"step": 3565
},
{
"epoch": 3.901639344262295,
"grad_norm": 1.8490026777793342,
"learning_rate": 5.0000000000000064e-08,
"loss": 2.0528,
"step": 3570
},
{
"epoch": 3.907103825136612,
"grad_norm": 2.004168101245223,
"learning_rate": 5.000000000000003e-08,
"loss": 2.0062,
"step": 3575
},
{
"epoch": 3.912568306010929,
"grad_norm": 1.8584030836568644,
"learning_rate": 5.000000000000002e-08,
"loss": 2.0026,
"step": 3580
},
{
"epoch": 3.918032786885246,
"grad_norm": 1.8750862900064005,
"learning_rate": 5.0000000000000004e-08,
"loss": 2.0304,
"step": 3585
},
{
"epoch": 3.9234972677595628,
"grad_norm": 1.9298592977310705,
"learning_rate": 5.0000000000000004e-08,
"loss": 2.0262,
"step": 3590
},
{
"epoch": 3.92896174863388,
"grad_norm": 1.9261861281030954,
"learning_rate": 5e-08,
"loss": 2.0747,
"step": 3595
},
{
"epoch": 3.9344262295081966,
"grad_norm": 1.9012633598619333,
"learning_rate": 5e-08,
"loss": 2.0355,
"step": 3600
},
{
"epoch": 3.9344262295081966,
"eval_loss": 2.277177333831787,
"eval_runtime": 75.1005,
"eval_samples_per_second": 86.644,
"eval_steps_per_second": 0.679,
"step": 3600
},
{
"epoch": 3.939890710382514,
"grad_norm": 1.9662605743553438,
"learning_rate": 5e-08,
"loss": 2.0272,
"step": 3605
},
{
"epoch": 3.9453551912568305,
"grad_norm": 1.8777765308314378,
"learning_rate": 5e-08,
"loss": 2.0574,
"step": 3610
},
{
"epoch": 3.9508196721311473,
"grad_norm": 1.9697643417177255,
"learning_rate": 5e-08,
"loss": 2.0504,
"step": 3615
},
{
"epoch": 3.9562841530054644,
"grad_norm": 1.91285486557523,
"learning_rate": 5e-08,
"loss": 2.0216,
"step": 3620
},
{
"epoch": 3.9617486338797816,
"grad_norm": 1.894324240473093,
"learning_rate": 5e-08,
"loss": 2.0108,
"step": 3625
},
{
"epoch": 3.9672131147540983,
"grad_norm": 1.9284412363816936,
"learning_rate": 5e-08,
"loss": 2.0038,
"step": 3630
},
{
"epoch": 3.972677595628415,
"grad_norm": 1.8376681173174465,
"learning_rate": 5e-08,
"loss": 2.021,
"step": 3635
},
{
"epoch": 3.978142076502732,
"grad_norm": 1.8629566090204688,
"learning_rate": 5e-08,
"loss": 2.0236,
"step": 3640
},
{
"epoch": 3.9836065573770494,
"grad_norm": 1.9846522235537283,
"learning_rate": 5e-08,
"loss": 2.024,
"step": 3645
},
{
"epoch": 3.989071038251366,
"grad_norm": 1.9025611361991746,
"learning_rate": 5e-08,
"loss": 2.0281,
"step": 3650
},
{
"epoch": 3.994535519125683,
"grad_norm": 1.9351822472092162,
"learning_rate": 5e-08,
"loss": 2.0184,
"step": 3655
},
{
"epoch": 4.0,
"grad_norm": 1.9623534464978543,
"learning_rate": 5e-08,
"loss": 1.9875,
"step": 3660
},
{
"epoch": 4.0,
"step": 3660,
"total_flos": 382536630927360.0,
"train_loss": 2.107023582497581,
"train_runtime": 13273.308,
"train_samples_per_second": 17.646,
"train_steps_per_second": 0.276
}
],
"logging_steps": 5,
"max_steps": 3660,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 200,
"total_flos": 382536630927360.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}