{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997864616698697,
"eval_steps": 500,
"global_step": 2341,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002135383301302584,
"grad_norm": 0.17446549236774445,
"learning_rate": 4.999943721137594e-05,
"loss": 1.0426,
"num_input_tokens_seen": 127072,
"step": 5
},
{
"epoch": 0.004270766602605168,
"grad_norm": 0.19204697012901306,
"learning_rate": 4.999774887084225e-05,
"loss": 1.0578,
"num_input_tokens_seen": 281728,
"step": 10
},
{
"epoch": 0.0064061499039077515,
"grad_norm": 0.09412319213151932,
"learning_rate": 4.999493505441324e-05,
"loss": 0.6602,
"num_input_tokens_seen": 394304,
"step": 15
},
{
"epoch": 0.008541533205210335,
"grad_norm": 0.06181873008608818,
"learning_rate": 4.9990995888775614e-05,
"loss": 0.961,
"num_input_tokens_seen": 561888,
"step": 20
},
{
"epoch": 0.010676916506512918,
"grad_norm": 0.06059432402253151,
"learning_rate": 4.9985931551282785e-05,
"loss": 0.9722,
"num_input_tokens_seen": 722496,
"step": 25
},
{
"epoch": 0.012812299807815503,
"grad_norm": 0.05892226845026016,
"learning_rate": 4.997974226994687e-05,
"loss": 0.7008,
"num_input_tokens_seen": 850176,
"step": 30
},
{
"epoch": 0.014947683109118086,
"grad_norm": 0.0467025563120842,
"learning_rate": 4.9972428323428444e-05,
"loss": 0.9035,
"num_input_tokens_seen": 1017280,
"step": 35
},
{
"epoch": 0.01708306641042067,
"grad_norm": 0.04231492057442665,
"learning_rate": 4.996399004102397e-05,
"loss": 0.868,
"num_input_tokens_seen": 1199552,
"step": 40
},
{
"epoch": 0.019218449711723255,
"grad_norm": 0.05744878575205803,
"learning_rate": 4.9954427802651014e-05,
"loss": 0.7532,
"num_input_tokens_seen": 1343744,
"step": 45
},
{
"epoch": 0.021353833013025837,
"grad_norm": 0.04732651636004448,
"learning_rate": 4.9943742038831076e-05,
"loss": 0.7994,
"num_input_tokens_seen": 1513920,
"step": 50
},
{
"epoch": 0.02348921631432842,
"grad_norm": 0.06059388071298599,
"learning_rate": 4.993193323067027e-05,
"loss": 0.8082,
"num_input_tokens_seen": 1680096,
"step": 55
},
{
"epoch": 0.025624599615631006,
"grad_norm": 0.05906912684440613,
"learning_rate": 4.9919001909837625e-05,
"loss": 0.7103,
"num_input_tokens_seen": 1853312,
"step": 60
},
{
"epoch": 0.02775998291693359,
"grad_norm": 0.05951849743723869,
"learning_rate": 4.990494865854116e-05,
"loss": 0.6723,
"num_input_tokens_seen": 2007040,
"step": 65
},
{
"epoch": 0.029895366218236172,
"grad_norm": 0.05470618978142738,
"learning_rate": 4.9889774109501675e-05,
"loss": 0.6473,
"num_input_tokens_seen": 2127456,
"step": 70
},
{
"epoch": 0.032030749519538756,
"grad_norm": 0.052827127277851105,
"learning_rate": 4.987347894592426e-05,
"loss": 0.8123,
"num_input_tokens_seen": 2276992,
"step": 75
},
{
"epoch": 0.03416613282084134,
"grad_norm": 0.04660721495747566,
"learning_rate": 4.985606390146752e-05,
"loss": 0.8803,
"num_input_tokens_seen": 2463776,
"step": 80
},
{
"epoch": 0.036301516122143926,
"grad_norm": 0.047477494925260544,
"learning_rate": 4.983752976021058e-05,
"loss": 0.7062,
"num_input_tokens_seen": 2619296,
"step": 85
},
{
"epoch": 0.03843689942344651,
"grad_norm": 0.050083596259355545,
"learning_rate": 4.981787735661774e-05,
"loss": 0.7329,
"num_input_tokens_seen": 2807456,
"step": 90
},
{
"epoch": 0.040572282724749095,
"grad_norm": 0.07300040125846863,
"learning_rate": 4.9797107575500934e-05,
"loss": 0.708,
"num_input_tokens_seen": 2982592,
"step": 95
},
{
"epoch": 0.04270766602605167,
"grad_norm": 0.0638803094625473,
"learning_rate": 4.977522135197988e-05,
"loss": 0.686,
"num_input_tokens_seen": 3118176,
"step": 100
},
{
"epoch": 0.04484304932735426,
"grad_norm": 0.058782994747161865,
"learning_rate": 4.975221967144e-05,
"loss": 0.8154,
"num_input_tokens_seen": 3318528,
"step": 105
},
{
"epoch": 0.04697843262865684,
"grad_norm": 0.06658528745174408,
"learning_rate": 4.972810356948803e-05,
"loss": 0.7786,
"num_input_tokens_seen": 3446496,
"step": 110
},
{
"epoch": 0.04911381592995943,
"grad_norm": 0.05051959306001663,
"learning_rate": 4.9702874131905375e-05,
"loss": 0.848,
"num_input_tokens_seen": 3633536,
"step": 115
},
{
"epoch": 0.05124919923126201,
"grad_norm": 0.055378127843141556,
"learning_rate": 4.967653249459928e-05,
"loss": 0.6415,
"num_input_tokens_seen": 3797920,
"step": 120
},
{
"epoch": 0.053384582532564596,
"grad_norm": 0.053418923169374466,
"learning_rate": 4.9649079843551663e-05,
"loss": 0.885,
"num_input_tokens_seen": 3972288,
"step": 125
},
{
"epoch": 0.05551996583386718,
"grad_norm": 0.042733389884233475,
"learning_rate": 4.9620517414765685e-05,
"loss": 0.7594,
"num_input_tokens_seen": 4155520,
"step": 130
},
{
"epoch": 0.057655349135169766,
"grad_norm": 0.05269218608736992,
"learning_rate": 4.959084649421016e-05,
"loss": 0.5479,
"num_input_tokens_seen": 4319488,
"step": 135
},
{
"epoch": 0.059790732436472344,
"grad_norm": 0.06381044536828995,
"learning_rate": 4.9560068417761595e-05,
"loss": 0.7997,
"num_input_tokens_seen": 4473120,
"step": 140
},
{
"epoch": 0.06192611573777493,
"grad_norm": 0.09567151963710785,
"learning_rate": 4.952818457114411e-05,
"loss": 0.7524,
"num_input_tokens_seen": 4612768,
"step": 145
},
{
"epoch": 0.06406149903907751,
"grad_norm": 0.054249729961156845,
"learning_rate": 4.9495196389866995e-05,
"loss": 0.7524,
"num_input_tokens_seen": 4747680,
"step": 150
},
{
"epoch": 0.0661968823403801,
"grad_norm": 0.05188503488898277,
"learning_rate": 4.946110535916009e-05,
"loss": 0.7265,
"num_input_tokens_seen": 4943264,
"step": 155
},
{
"epoch": 0.06833226564168268,
"grad_norm": 0.05143864452838898,
"learning_rate": 4.942591301390695e-05,
"loss": 0.6025,
"num_input_tokens_seen": 5085408,
"step": 160
},
{
"epoch": 0.07046764894298527,
"grad_norm": 0.05344095826148987,
"learning_rate": 4.9389620938575695e-05,
"loss": 0.6499,
"num_input_tokens_seen": 5256288,
"step": 165
},
{
"epoch": 0.07260303224428785,
"grad_norm": 0.06509006023406982,
"learning_rate": 4.935223076714769e-05,
"loss": 0.723,
"num_input_tokens_seen": 5437312,
"step": 170
},
{
"epoch": 0.07473841554559044,
"grad_norm": 0.06998533755540848,
"learning_rate": 4.9313744183044e-05,
"loss": 0.7537,
"num_input_tokens_seen": 5568800,
"step": 175
},
{
"epoch": 0.07687379884689302,
"grad_norm": 0.05648740753531456,
"learning_rate": 4.927416291904955e-05,
"loss": 0.7523,
"num_input_tokens_seen": 5721568,
"step": 180
},
{
"epoch": 0.0790091821481956,
"grad_norm": 0.057639315724372864,
"learning_rate": 4.9233488757235145e-05,
"loss": 0.6245,
"num_input_tokens_seen": 5874336,
"step": 185
},
{
"epoch": 0.08114456544949819,
"grad_norm": 0.06750814616680145,
"learning_rate": 4.919172352887725e-05,
"loss": 0.7379,
"num_input_tokens_seen": 6031904,
"step": 190
},
{
"epoch": 0.08327994875080078,
"grad_norm": 0.0563182458281517,
"learning_rate": 4.914886911437547e-05,
"loss": 0.6706,
"num_input_tokens_seen": 6201152,
"step": 195
},
{
"epoch": 0.08541533205210335,
"grad_norm": 0.05895683914422989,
"learning_rate": 4.910492744316799e-05,
"loss": 0.6494,
"num_input_tokens_seen": 6356416,
"step": 200
},
{
"epoch": 0.08755071535340593,
"grad_norm": 0.05769870802760124,
"learning_rate": 4.905990049364461e-05,
"loss": 0.7371,
"num_input_tokens_seen": 6502272,
"step": 205
},
{
"epoch": 0.08968609865470852,
"grad_norm": 0.06006137281656265,
"learning_rate": 4.9013790293057714e-05,
"loss": 0.7105,
"num_input_tokens_seen": 6662432,
"step": 210
},
{
"epoch": 0.0918214819560111,
"grad_norm": 0.050949618220329285,
"learning_rate": 4.8966598917431036e-05,
"loss": 0.6886,
"num_input_tokens_seen": 6826048,
"step": 215
},
{
"epoch": 0.09395686525731368,
"grad_norm": 0.06648615002632141,
"learning_rate": 4.8918328491466106e-05,
"loss": 0.6842,
"num_input_tokens_seen": 6992928,
"step": 220
},
{
"epoch": 0.09609224855861627,
"grad_norm": 0.05005276948213577,
"learning_rate": 4.886898118844666e-05,
"loss": 0.733,
"num_input_tokens_seen": 7128704,
"step": 225
},
{
"epoch": 0.09822763185991885,
"grad_norm": 0.09183106571435928,
"learning_rate": 4.881855923014076e-05,
"loss": 0.5728,
"num_input_tokens_seen": 7266464,
"step": 230
},
{
"epoch": 0.10036301516122144,
"grad_norm": 0.05412563309073448,
"learning_rate": 4.876706488670077e-05,
"loss": 0.6706,
"num_input_tokens_seen": 7430912,
"step": 235
},
{
"epoch": 0.10249839846252402,
"grad_norm": 0.07257558405399323,
"learning_rate": 4.871450047656114e-05,
"loss": 0.6395,
"num_input_tokens_seen": 7560288,
"step": 240
},
{
"epoch": 0.10463378176382661,
"grad_norm": 0.058103807270526886,
"learning_rate": 4.866086836633403e-05,
"loss": 0.6815,
"num_input_tokens_seen": 7708480,
"step": 245
},
{
"epoch": 0.10676916506512919,
"grad_norm": 0.0581187903881073,
"learning_rate": 4.860617097070278e-05,
"loss": 0.8152,
"num_input_tokens_seen": 7871168,
"step": 250
},
{
"epoch": 0.10890454836643178,
"grad_norm": 0.1157078966498375,
"learning_rate": 4.855041075231314e-05,
"loss": 0.7163,
"num_input_tokens_seen": 8011264,
"step": 255
},
{
"epoch": 0.11103993166773436,
"grad_norm": 0.06340761482715607,
"learning_rate": 4.8493590221662436e-05,
"loss": 0.73,
"num_input_tokens_seen": 8152320,
"step": 260
},
{
"epoch": 0.11317531496903695,
"grad_norm": 0.15619327127933502,
"learning_rate": 4.843571193698653e-05,
"loss": 0.8089,
"num_input_tokens_seen": 8293312,
"step": 265
},
{
"epoch": 0.11531069827033953,
"grad_norm": 0.07074210792779922,
"learning_rate": 4.837677850414464e-05,
"loss": 0.6812,
"num_input_tokens_seen": 8472896,
"step": 270
},
{
"epoch": 0.11744608157164212,
"grad_norm": 0.07977280020713806,
"learning_rate": 4.8316792576502004e-05,
"loss": 0.7619,
"num_input_tokens_seen": 8643552,
"step": 275
},
{
"epoch": 0.11958146487294469,
"grad_norm": 0.07114008814096451,
"learning_rate": 4.825575685481045e-05,
"loss": 0.7461,
"num_input_tokens_seen": 8804736,
"step": 280
},
{
"epoch": 0.12171684817424727,
"grad_norm": 0.06181742250919342,
"learning_rate": 4.819367408708676e-05,
"loss": 0.8225,
"num_input_tokens_seen": 8951648,
"step": 285
},
{
"epoch": 0.12385223147554986,
"grad_norm": 0.07226210832595825,
"learning_rate": 4.8130547068488954e-05,
"loss": 0.7792,
"num_input_tokens_seen": 9097312,
"step": 290
},
{
"epoch": 0.12598761477685244,
"grad_norm": 0.0596122108399868,
"learning_rate": 4.806637864119049e-05,
"loss": 0.8316,
"num_input_tokens_seen": 9234688,
"step": 295
},
{
"epoch": 0.12812299807815503,
"grad_norm": 0.057102903723716736,
"learning_rate": 4.800117169425223e-05,
"loss": 0.7616,
"num_input_tokens_seen": 9410528,
"step": 300
},
{
"epoch": 0.1302583813794576,
"grad_norm": 0.06964308768510818,
"learning_rate": 4.79349291634924e-05,
"loss": 0.7982,
"num_input_tokens_seen": 9593280,
"step": 305
},
{
"epoch": 0.1323937646807602,
"grad_norm": 0.06177399307489395,
"learning_rate": 4.786765403135444e-05,
"loss": 0.7515,
"num_input_tokens_seen": 9769824,
"step": 310
},
{
"epoch": 0.13452914798206278,
"grad_norm": 0.08495648950338364,
"learning_rate": 4.779934932677265e-05,
"loss": 0.6677,
"num_input_tokens_seen": 9921536,
"step": 315
},
{
"epoch": 0.13666453128336536,
"grad_norm": 0.060477472841739655,
"learning_rate": 4.77300181250359e-05,
"loss": 0.7559,
"num_input_tokens_seen": 10089088,
"step": 320
},
{
"epoch": 0.13879991458466795,
"grad_norm": 0.05845116078853607,
"learning_rate": 4.7659663547649124e-05,
"loss": 0.7337,
"num_input_tokens_seen": 10282272,
"step": 325
},
{
"epoch": 0.14093529788597053,
"grad_norm": 0.06424874067306519,
"learning_rate": 4.758828876219278e-05,
"loss": 0.8009,
"num_input_tokens_seen": 10450848,
"step": 330
},
{
"epoch": 0.14307068118727312,
"grad_norm": 0.06901010870933533,
"learning_rate": 4.751589698218026e-05,
"loss": 0.7203,
"num_input_tokens_seen": 10617664,
"step": 335
},
{
"epoch": 0.1452060644885757,
"grad_norm": 0.07261721789836884,
"learning_rate": 4.744249146691317e-05,
"loss": 0.5286,
"num_input_tokens_seen": 10794880,
"step": 340
},
{
"epoch": 0.1473414477898783,
"grad_norm": 0.07624544203281403,
"learning_rate": 4.736807552133464e-05,
"loss": 0.6662,
"num_input_tokens_seen": 10956960,
"step": 345
},
{
"epoch": 0.14947683109118087,
"grad_norm": 0.06444702297449112,
"learning_rate": 4.729265249588046e-05,
"loss": 0.6554,
"num_input_tokens_seen": 11105440,
"step": 350
},
{
"epoch": 0.15161221439248346,
"grad_norm": 0.08421933650970459,
"learning_rate": 4.721622578632832e-05,
"loss": 0.7981,
"num_input_tokens_seen": 11248448,
"step": 355
},
{
"epoch": 0.15374759769378604,
"grad_norm": 0.060541413724422455,
"learning_rate": 4.71387988336448e-05,
"loss": 0.5976,
"num_input_tokens_seen": 11404928,
"step": 360
},
{
"epoch": 0.15588298099508863,
"grad_norm": 0.07518257945775986,
"learning_rate": 4.706037512383058e-05,
"loss": 0.7783,
"num_input_tokens_seen": 11586880,
"step": 365
},
{
"epoch": 0.1580183642963912,
"grad_norm": 0.051343463361263275,
"learning_rate": 4.6980958187763394e-05,
"loss": 0.6556,
"num_input_tokens_seen": 11746368,
"step": 370
},
{
"epoch": 0.1601537475976938,
"grad_norm": 0.08890614658594131,
"learning_rate": 4.690055160103908e-05,
"loss": 0.5951,
"num_input_tokens_seen": 11876928,
"step": 375
},
{
"epoch": 0.16228913089899638,
"grad_norm": 0.049633271992206573,
"learning_rate": 4.681915898381064e-05,
"loss": 0.7438,
"num_input_tokens_seen": 12070656,
"step": 380
},
{
"epoch": 0.16442451420029897,
"grad_norm": 0.06845410168170929,
"learning_rate": 4.67367840006252e-05,
"loss": 0.861,
"num_input_tokens_seen": 12215104,
"step": 385
},
{
"epoch": 0.16655989750160155,
"grad_norm": 0.09894266724586487,
"learning_rate": 4.6653430360259015e-05,
"loss": 0.597,
"num_input_tokens_seen": 12367616,
"step": 390
},
{
"epoch": 0.16869528080290414,
"grad_norm": 0.07006240636110306,
"learning_rate": 4.656910181555055e-05,
"loss": 0.6786,
"num_input_tokens_seen": 12550368,
"step": 395
},
{
"epoch": 0.1708306641042067,
"grad_norm": 0.08737102895975113,
"learning_rate": 4.648380216323145e-05,
"loss": 0.6539,
"num_input_tokens_seen": 12693248,
"step": 400
},
{
"epoch": 0.17296604740550928,
"grad_norm": 0.08132334798574448,
"learning_rate": 4.639753524375564e-05,
"loss": 0.8733,
"num_input_tokens_seen": 12856832,
"step": 405
},
{
"epoch": 0.17510143070681186,
"grad_norm": 0.061612244695425034,
"learning_rate": 4.631030494112638e-05,
"loss": 0.636,
"num_input_tokens_seen": 13028352,
"step": 410
},
{
"epoch": 0.17723681400811445,
"grad_norm": 0.07655072212219238,
"learning_rate": 4.622211518272144e-05,
"loss": 0.7299,
"num_input_tokens_seen": 13163616,
"step": 415
},
{
"epoch": 0.17937219730941703,
"grad_norm": 0.06312955170869827,
"learning_rate": 4.613296993911623e-05,
"loss": 0.5954,
"num_input_tokens_seen": 13336608,
"step": 420
},
{
"epoch": 0.18150758061071962,
"grad_norm": 0.07038469612598419,
"learning_rate": 4.604287322390509e-05,
"loss": 0.8243,
"num_input_tokens_seen": 13505408,
"step": 425
},
{
"epoch": 0.1836429639120222,
"grad_norm": 0.0707494243979454,
"learning_rate": 4.59518290935205e-05,
"loss": 0.5552,
"num_input_tokens_seen": 13642592,
"step": 430
},
{
"epoch": 0.18577834721332478,
"grad_norm": 0.06867733597755432,
"learning_rate": 4.5859841647050565e-05,
"loss": 0.6857,
"num_input_tokens_seen": 13790976,
"step": 435
},
{
"epoch": 0.18791373051462737,
"grad_norm": 0.06942213326692581,
"learning_rate": 4.576691502605434e-05,
"loss": 0.6743,
"num_input_tokens_seen": 13956224,
"step": 440
},
{
"epoch": 0.19004911381592995,
"grad_norm": 0.06548978388309479,
"learning_rate": 4.5673053414375436e-05,
"loss": 0.579,
"num_input_tokens_seen": 14115296,
"step": 445
},
{
"epoch": 0.19218449711723254,
"grad_norm": 0.07146024703979492,
"learning_rate": 4.557826103795364e-05,
"loss": 0.8965,
"num_input_tokens_seen": 14267168,
"step": 450
},
{
"epoch": 0.19431988041853512,
"grad_norm": 0.10247491300106049,
"learning_rate": 4.548254216463465e-05,
"loss": 0.8137,
"num_input_tokens_seen": 14413312,
"step": 455
},
{
"epoch": 0.1964552637198377,
"grad_norm": 0.08518624305725098,
"learning_rate": 4.538590110397789e-05,
"loss": 0.617,
"num_input_tokens_seen": 14550880,
"step": 460
},
{
"epoch": 0.1985906470211403,
"grad_norm": 0.1248399019241333,
"learning_rate": 4.528834220706253e-05,
"loss": 0.9175,
"num_input_tokens_seen": 14691712,
"step": 465
},
{
"epoch": 0.20072603032244288,
"grad_norm": 0.06742729991674423,
"learning_rate": 4.518986986629157e-05,
"loss": 0.7633,
"num_input_tokens_seen": 14861408,
"step": 470
},
{
"epoch": 0.20286141362374546,
"grad_norm": 0.09116410464048386,
"learning_rate": 4.509048851519404e-05,
"loss": 0.6935,
"num_input_tokens_seen": 15003328,
"step": 475
},
{
"epoch": 0.20499679692504805,
"grad_norm": 0.08975204825401306,
"learning_rate": 4.499020262822547e-05,
"loss": 0.6322,
"num_input_tokens_seen": 15125792,
"step": 480
},
{
"epoch": 0.20713218022635063,
"grad_norm": 0.10138271003961563,
"learning_rate": 4.4889016720566355e-05,
"loss": 0.9118,
"num_input_tokens_seen": 15301856,
"step": 485
},
{
"epoch": 0.20926756352765322,
"grad_norm": 0.07376892864704132,
"learning_rate": 4.478693534791893e-05,
"loss": 0.6331,
"num_input_tokens_seen": 15487488,
"step": 490
},
{
"epoch": 0.2114029468289558,
"grad_norm": 0.07480096817016602,
"learning_rate": 4.4683963106302e-05,
"loss": 0.7326,
"num_input_tokens_seen": 15657312,
"step": 495
},
{
"epoch": 0.21353833013025839,
"grad_norm": 0.06383755058050156,
"learning_rate": 4.458010463184405e-05,
"loss": 0.6806,
"num_input_tokens_seen": 15850912,
"step": 500
},
{
"epoch": 0.21567371343156097,
"grad_norm": 0.06868927925825119,
"learning_rate": 4.4475364600574535e-05,
"loss": 0.7017,
"num_input_tokens_seen": 15986400,
"step": 505
},
{
"epoch": 0.21780909673286356,
"grad_norm": 0.09151501208543777,
"learning_rate": 4.43697477282133e-05,
"loss": 0.6438,
"num_input_tokens_seen": 16144960,
"step": 510
},
{
"epoch": 0.21994448003416614,
"grad_norm": 0.09519924968481064,
"learning_rate": 4.4263258769958274e-05,
"loss": 0.757,
"num_input_tokens_seen": 16289856,
"step": 515
},
{
"epoch": 0.22207986333546872,
"grad_norm": 0.09690000116825104,
"learning_rate": 4.415590252027141e-05,
"loss": 0.6478,
"num_input_tokens_seen": 16439328,
"step": 520
},
{
"epoch": 0.2242152466367713,
"grad_norm": 0.06739991158246994,
"learning_rate": 4.404768381266279e-05,
"loss": 0.7572,
"num_input_tokens_seen": 16575552,
"step": 525
},
{
"epoch": 0.2263506299380739,
"grad_norm": 0.08569491654634476,
"learning_rate": 4.393860751947302e-05,
"loss": 0.7073,
"num_input_tokens_seen": 16754016,
"step": 530
},
{
"epoch": 0.22848601323937648,
"grad_norm": 0.07734335213899612,
"learning_rate": 4.382867855165386e-05,
"loss": 0.6275,
"num_input_tokens_seen": 16897248,
"step": 535
},
{
"epoch": 0.23062139654067906,
"grad_norm": 0.10210688412189484,
"learning_rate": 4.371790185854709e-05,
"loss": 0.6937,
"num_input_tokens_seen": 17077792,
"step": 540
},
{
"epoch": 0.23275677984198165,
"grad_norm": 0.08407072722911835,
"learning_rate": 4.360628242766175e-05,
"loss": 0.7242,
"num_input_tokens_seen": 17232480,
"step": 545
},
{
"epoch": 0.23489216314328423,
"grad_norm": 0.07300622761249542,
"learning_rate": 4.3493825284449515e-05,
"loss": 0.6462,
"num_input_tokens_seen": 17371008,
"step": 550
},
{
"epoch": 0.2370275464445868,
"grad_norm": 0.0677730068564415,
"learning_rate": 4.338053549207844e-05,
"loss": 0.6891,
"num_input_tokens_seen": 17502016,
"step": 555
},
{
"epoch": 0.23916292974588937,
"grad_norm": 0.07456525415182114,
"learning_rate": 4.326641815120505e-05,
"loss": 0.6293,
"num_input_tokens_seen": 17661632,
"step": 560
},
{
"epoch": 0.24129831304719196,
"grad_norm": 0.0730578750371933,
"learning_rate": 4.315147839974464e-05,
"loss": 0.7189,
"num_input_tokens_seen": 17781440,
"step": 565
},
{
"epoch": 0.24343369634849454,
"grad_norm": 0.10547315329313278,
"learning_rate": 4.303572141263997e-05,
"loss": 0.6933,
"num_input_tokens_seen": 17983840,
"step": 570
},
{
"epoch": 0.24556907964979713,
"grad_norm": 0.08231621235609055,
"learning_rate": 4.2919152401628284e-05,
"loss": 0.6973,
"num_input_tokens_seen": 18166592,
"step": 575
},
{
"epoch": 0.2477044629510997,
"grad_norm": 0.0755239874124527,
"learning_rate": 4.2801776615006644e-05,
"loss": 0.5742,
"num_input_tokens_seen": 18302912,
"step": 580
},
{
"epoch": 0.2498398462524023,
"grad_norm": 0.0680757462978363,
"learning_rate": 4.2683599337395655e-05,
"loss": 0.6087,
"num_input_tokens_seen": 18469344,
"step": 585
},
{
"epoch": 0.2519752295537049,
"grad_norm": 0.07045536488294601,
"learning_rate": 4.2564625889501496e-05,
"loss": 0.6595,
"num_input_tokens_seen": 18599104,
"step": 590
},
{
"epoch": 0.25411061285500747,
"grad_norm": 0.09744574129581451,
"learning_rate": 4.2444861627876444e-05,
"loss": 0.7353,
"num_input_tokens_seen": 18785696,
"step": 595
},
{
"epoch": 0.25624599615631005,
"grad_norm": 0.07259754836559296,
"learning_rate": 4.2324311944677585e-05,
"loss": 0.8322,
"num_input_tokens_seen": 18972224,
"step": 600
},
{
"epoch": 0.25838137945761264,
"grad_norm": 0.08454828709363937,
"learning_rate": 4.220298226742415e-05,
"loss": 0.6534,
"num_input_tokens_seen": 19107968,
"step": 605
},
{
"epoch": 0.2605167627589152,
"grad_norm": 0.07386191189289093,
"learning_rate": 4.208087805875314e-05,
"loss": 0.7441,
"num_input_tokens_seen": 19295072,
"step": 610
},
{
"epoch": 0.2626521460602178,
"grad_norm": 0.07102880626916885,
"learning_rate": 4.195800481617328e-05,
"loss": 0.816,
"num_input_tokens_seen": 19440384,
"step": 615
},
{
"epoch": 0.2647875293615204,
"grad_norm": 0.07378337532281876,
"learning_rate": 4.183436807181765e-05,
"loss": 0.7341,
"num_input_tokens_seen": 19619680,
"step": 620
},
{
"epoch": 0.266922912662823,
"grad_norm": 0.09023339301347733,
"learning_rate": 4.17099733921945e-05,
"loss": 0.6835,
"num_input_tokens_seen": 19759200,
"step": 625
},
{
"epoch": 0.26905829596412556,
"grad_norm": 0.07276886701583862,
"learning_rate": 4.158482637793667e-05,
"loss": 0.7359,
"num_input_tokens_seen": 19924448,
"step": 630
},
{
"epoch": 0.27119367926542814,
"grad_norm": 0.09461617469787598,
"learning_rate": 4.145893266354944e-05,
"loss": 0.6531,
"num_input_tokens_seen": 20077888,
"step": 635
},
{
"epoch": 0.27332906256673073,
"grad_norm": 0.10658084601163864,
"learning_rate": 4.133229791715685e-05,
"loss": 0.5728,
"num_input_tokens_seen": 20223296,
"step": 640
},
{
"epoch": 0.2754644458680333,
"grad_norm": 0.07908082008361816,
"learning_rate": 4.1204927840246455e-05,
"loss": 0.7355,
"num_input_tokens_seen": 20352928,
"step": 645
},
{
"epoch": 0.2775998291693359,
"grad_norm": 0.07296961545944214,
"learning_rate": 4.1076828167412683e-05,
"loss": 0.6645,
"num_input_tokens_seen": 20511232,
"step": 650
},
{
"epoch": 0.2797352124706385,
"grad_norm": 0.0914238691329956,
"learning_rate": 4.0948004666098625e-05,
"loss": 0.5866,
"num_input_tokens_seen": 20684032,
"step": 655
},
{
"epoch": 0.28187059577194107,
"grad_norm": 0.08596916496753693,
"learning_rate": 4.081846313633637e-05,
"loss": 0.6235,
"num_input_tokens_seen": 20826176,
"step": 660
},
{
"epoch": 0.28400597907324365,
"grad_norm": 0.08031884580850601,
"learning_rate": 4.068820941048587e-05,
"loss": 0.6974,
"num_input_tokens_seen": 21007264,
"step": 665
},
{
"epoch": 0.28614136237454624,
"grad_norm": 0.09857963025569916,
"learning_rate": 4.0557249352972316e-05,
"loss": 0.6629,
"num_input_tokens_seen": 21145024,
"step": 670
},
{
"epoch": 0.2882767456758488,
"grad_norm": 0.07686656713485718,
"learning_rate": 4.0425588860022166e-05,
"loss": 0.7321,
"num_input_tokens_seen": 21295104,
"step": 675
},
{
"epoch": 0.2904121289771514,
"grad_norm": 0.0786074548959732,
"learning_rate": 4.029323385939763e-05,
"loss": 0.6325,
"num_input_tokens_seen": 21440256,
"step": 680
},
{
"epoch": 0.292547512278454,
"grad_norm": 0.0865710899233818,
"learning_rate": 4.0160190310129806e-05,
"loss": 0.6882,
"num_input_tokens_seen": 21592768,
"step": 685
},
{
"epoch": 0.2946828955797566,
"grad_norm": 0.0743151381611824,
"learning_rate": 4.0026464202250375e-05,
"loss": 0.659,
"num_input_tokens_seen": 21763360,
"step": 690
},
{
"epoch": 0.29681827888105916,
"grad_norm": 0.060234300792217255,
"learning_rate": 3.989206155652192e-05,
"loss": 0.6757,
"num_input_tokens_seen": 21917792,
"step": 695
},
{
"epoch": 0.29895366218236175,
"grad_norm": 0.09439852088689804,
"learning_rate": 3.975698842416684e-05,
"loss": 0.6238,
"num_input_tokens_seen": 22052384,
"step": 700
},
{
"epoch": 0.30108904548366433,
"grad_norm": 0.07870359718799591,
"learning_rate": 3.962125088659492e-05,
"loss": 0.688,
"num_input_tokens_seen": 22225568,
"step": 705
},
{
"epoch": 0.3032244287849669,
"grad_norm": 0.08472148329019547,
"learning_rate": 3.948485505512953e-05,
"loss": 0.7123,
"num_input_tokens_seen": 22388160,
"step": 710
},
{
"epoch": 0.3053598120862695,
"grad_norm": 0.07081770896911621,
"learning_rate": 3.9347807070732444e-05,
"loss": 0.6638,
"num_input_tokens_seen": 22579936,
"step": 715
},
{
"epoch": 0.3074951953875721,
"grad_norm": 0.07737255096435547,
"learning_rate": 3.921011310372739e-05,
"loss": 0.7064,
"num_input_tokens_seen": 22730048,
"step": 720
},
{
"epoch": 0.30963057868887467,
"grad_norm": 0.0714409351348877,
"learning_rate": 3.907177935352223e-05,
"loss": 0.5651,
"num_input_tokens_seen": 22911168,
"step": 725
},
{
"epoch": 0.31176596199017725,
"grad_norm": 0.06933268904685974,
"learning_rate": 3.893281204832984e-05,
"loss": 0.6695,
"num_input_tokens_seen": 23088096,
"step": 730
},
{
"epoch": 0.31390134529147984,
"grad_norm": 0.10002848505973816,
"learning_rate": 3.87932174448877e-05,
"loss": 0.5989,
"num_input_tokens_seen": 23243616,
"step": 735
},
{
"epoch": 0.3160367285927824,
"grad_norm": 0.07605909556150436,
"learning_rate": 3.8653001828176185e-05,
"loss": 0.5707,
"num_input_tokens_seen": 23402240,
"step": 740
},
{
"epoch": 0.318172111894085,
"grad_norm": 0.09124422818422318,
"learning_rate": 3.8512171511135616e-05,
"loss": 0.6727,
"num_input_tokens_seen": 23568096,
"step": 745
},
{
"epoch": 0.3203074951953876,
"grad_norm": 0.08254604786634445,
"learning_rate": 3.8370732834382025e-05,
"loss": 0.7122,
"num_input_tokens_seen": 23723968,
"step": 750
},
{
"epoch": 0.3224428784966902,
"grad_norm": 0.07256225496530533,
"learning_rate": 3.822869216592167e-05,
"loss": 0.6667,
"num_input_tokens_seen": 23882016,
"step": 755
},
{
"epoch": 0.32457826179799276,
"grad_norm": 0.07740245759487152,
"learning_rate": 3.8086055900864356e-05,
"loss": 0.7896,
"num_input_tokens_seen": 24037088,
"step": 760
},
{
"epoch": 0.32671364509929535,
"grad_norm": 0.0661853477358818,
"learning_rate": 3.794283046113546e-05,
"loss": 0.6208,
"num_input_tokens_seen": 24180032,
"step": 765
},
{
"epoch": 0.32884902840059793,
"grad_norm": 0.1970444917678833,
"learning_rate": 3.7799022295186823e-05,
"loss": 0.6193,
"num_input_tokens_seen": 24363168,
"step": 770
},
{
"epoch": 0.3309844117019005,
"grad_norm": 0.0673363208770752,
"learning_rate": 3.765463787770645e-05,
"loss": 0.6024,
"num_input_tokens_seen": 24522112,
"step": 775
},
{
"epoch": 0.3331197950032031,
"grad_norm": 0.11159452795982361,
"learning_rate": 3.750968370932694e-05,
"loss": 0.7026,
"num_input_tokens_seen": 24694048,
"step": 780
},
{
"epoch": 0.3352551783045057,
"grad_norm": 0.0691002830862999,
"learning_rate": 3.736416631633286e-05,
"loss": 0.6094,
"num_input_tokens_seen": 24847616,
"step": 785
},
{
"epoch": 0.33739056160580827,
"grad_norm": 0.10651443898677826,
"learning_rate": 3.721809225036688e-05,
"loss": 0.6167,
"num_input_tokens_seen": 24992096,
"step": 790
},
{
"epoch": 0.3395259449071108,
"grad_norm": 0.1445714682340622,
"learning_rate": 3.7071468088134806e-05,
"loss": 0.6861,
"num_input_tokens_seen": 25145792,
"step": 795
},
{
"epoch": 0.3416613282084134,
"grad_norm": 0.10462247580289841,
"learning_rate": 3.692430043110947e-05,
"loss": 0.8109,
"num_input_tokens_seen": 25316896,
"step": 800
},
{
"epoch": 0.34379671150971597,
"grad_norm": 0.09085245430469513,
"learning_rate": 3.677659590523354e-05,
"loss": 0.6796,
"num_input_tokens_seen": 25452608,
"step": 805
},
{
"epoch": 0.34593209481101855,
"grad_norm": 0.07898429781198502,
"learning_rate": 3.662836116062117e-05,
"loss": 0.8018,
"num_input_tokens_seen": 25597056,
"step": 810
},
{
"epoch": 0.34806747811232114,
"grad_norm": 0.07476533204317093,
"learning_rate": 3.647960287125859e-05,
"loss": 0.7318,
"num_input_tokens_seen": 25764224,
"step": 815
},
{
"epoch": 0.3502028614136237,
"grad_norm": 0.09074775129556656,
"learning_rate": 3.6330327734703626e-05,
"loss": 0.6615,
"num_input_tokens_seen": 25893824,
"step": 820
},
{
"epoch": 0.3523382447149263,
"grad_norm": 0.07897800952196121,
"learning_rate": 3.61805424717842e-05,
"loss": 0.6466,
"num_input_tokens_seen": 26034304,
"step": 825
},
{
"epoch": 0.3544736280162289,
"grad_norm": 0.07650279998779297,
"learning_rate": 3.603025382629565e-05,
"loss": 0.7432,
"num_input_tokens_seen": 26187712,
"step": 830
},
{
"epoch": 0.3566090113175315,
"grad_norm": 0.09015358239412308,
"learning_rate": 3.58794685646972e-05,
"loss": 0.6275,
"num_input_tokens_seen": 26338080,
"step": 835
},
{
"epoch": 0.35874439461883406,
"grad_norm": 0.08263330161571503,
"learning_rate": 3.572819347580722e-05,
"loss": 0.6545,
"num_input_tokens_seen": 26501440,
"step": 840
},
{
"epoch": 0.36087977792013665,
"grad_norm": 0.06605567783117294,
"learning_rate": 3.5576435370497655e-05,
"loss": 0.6806,
"num_input_tokens_seen": 26663936,
"step": 845
},
{
"epoch": 0.36301516122143923,
"grad_norm": 0.08297718316316605,
"learning_rate": 3.542420108138732e-05,
"loss": 0.6517,
"num_input_tokens_seen": 26834176,
"step": 850
},
{
"epoch": 0.3651505445227418,
"grad_norm": 0.14128510653972626,
"learning_rate": 3.527149746253431e-05,
"loss": 0.7356,
"num_input_tokens_seen": 26996928,
"step": 855
},
{
"epoch": 0.3672859278240444,
"grad_norm": 0.09986595809459686,
"learning_rate": 3.511833138912738e-05,
"loss": 0.8021,
"num_input_tokens_seen": 27162304,
"step": 860
},
{
"epoch": 0.369421311125347,
"grad_norm": 0.0766059085726738,
"learning_rate": 3.496470975717643e-05,
"loss": 0.7542,
"num_input_tokens_seen": 27319392,
"step": 865
},
{
"epoch": 0.37155669442664957,
"grad_norm": 0.07614283263683319,
"learning_rate": 3.4810639483202015e-05,
"loss": 0.6407,
"num_input_tokens_seen": 27511360,
"step": 870
},
{
"epoch": 0.37369207772795215,
"grad_norm": 0.06801874190568924,
"learning_rate": 3.465612750392393e-05,
"loss": 0.7553,
"num_input_tokens_seen": 27703488,
"step": 875
},
{
"epoch": 0.37582746102925474,
"grad_norm": 0.09346262365579605,
"learning_rate": 3.450118077594891e-05,
"loss": 0.6873,
"num_input_tokens_seen": 27866880,
"step": 880
},
{
"epoch": 0.3779628443305573,
"grad_norm": 0.07162796705961227,
"learning_rate": 3.434580627545743e-05,
"loss": 0.6827,
"num_input_tokens_seen": 28052480,
"step": 885
},
{
"epoch": 0.3800982276318599,
"grad_norm": 0.09126775711774826,
"learning_rate": 3.419001099788959e-05,
"loss": 0.6143,
"num_input_tokens_seen": 28229600,
"step": 890
},
{
"epoch": 0.3822336109331625,
"grad_norm": 0.07642071694135666,
"learning_rate": 3.403380195763018e-05,
"loss": 0.5969,
"num_input_tokens_seen": 28392992,
"step": 895
},
{
"epoch": 0.3843689942344651,
"grad_norm": 0.08265725523233414,
"learning_rate": 3.387718618769287e-05,
"loss": 0.4596,
"num_input_tokens_seen": 28569344,
"step": 900
},
{
"epoch": 0.38650437753576766,
"grad_norm": 0.08538588136434555,
"learning_rate": 3.372017073940355e-05,
"loss": 0.6412,
"num_input_tokens_seen": 28732608,
"step": 905
},
{
"epoch": 0.38863976083707025,
"grad_norm": 0.08859037607908249,
"learning_rate": 3.356276268208289e-05,
"loss": 0.7309,
"num_input_tokens_seen": 28885792,
"step": 910
},
{
"epoch": 0.39077514413837283,
"grad_norm": 0.08217044919729233,
"learning_rate": 3.340496910272798e-05,
"loss": 0.5964,
"num_input_tokens_seen": 29023008,
"step": 915
},
{
"epoch": 0.3929105274396754,
"grad_norm": 0.0807810053229332,
"learning_rate": 3.324679710569334e-05,
"loss": 0.6368,
"num_input_tokens_seen": 29167584,
"step": 920
},
{
"epoch": 0.395045910740978,
"grad_norm": 0.07328809797763824,
"learning_rate": 3.308825381237103e-05,
"loss": 0.626,
"num_input_tokens_seen": 29322720,
"step": 925
},
{
"epoch": 0.3971812940422806,
"grad_norm": 0.09321283549070358,
"learning_rate": 3.292934636086998e-05,
"loss": 0.8989,
"num_input_tokens_seen": 29487200,
"step": 930
},
{
"epoch": 0.39931667734358317,
"grad_norm": 0.08408747613430023,
"learning_rate": 3.2770081905694696e-05,
"loss": 0.7116,
"num_input_tokens_seen": 29651232,
"step": 935
},
{
"epoch": 0.40145206064488576,
"grad_norm": 0.09342040121555328,
"learning_rate": 3.261046761742305e-05,
"loss": 0.7665,
"num_input_tokens_seen": 29805216,
"step": 940
},
{
"epoch": 0.40358744394618834,
"grad_norm": 0.07169543951749802,
"learning_rate": 3.245051068238348e-05,
"loss": 0.6187,
"num_input_tokens_seen": 29967360,
"step": 945
},
{
"epoch": 0.4057228272474909,
"grad_norm": 0.09437992423772812,
"learning_rate": 3.229021830233149e-05,
"loss": 0.7386,
"num_input_tokens_seen": 30123104,
"step": 950
},
{
"epoch": 0.4078582105487935,
"grad_norm": 0.10910359770059586,
"learning_rate": 3.2129597694125296e-05,
"loss": 0.7952,
"num_input_tokens_seen": 30302240,
"step": 955
},
{
"epoch": 0.4099935938500961,
"grad_norm": 0.10328692942857742,
"learning_rate": 3.1968656089401e-05,
"loss": 0.6779,
"num_input_tokens_seen": 30445184,
"step": 960
},
{
"epoch": 0.4121289771513987,
"grad_norm": 0.06910215318202972,
"learning_rate": 3.180740073424693e-05,
"loss": 0.5771,
"num_input_tokens_seen": 30596384,
"step": 965
},
{
"epoch": 0.41426436045270126,
"grad_norm": 0.0915064588189125,
"learning_rate": 3.164583888887746e-05,
"loss": 0.6306,
"num_input_tokens_seen": 30778592,
"step": 970
},
{
"epoch": 0.41639974375400385,
"grad_norm": 0.08599945902824402,
"learning_rate": 3.1483977827306054e-05,
"loss": 0.693,
"num_input_tokens_seen": 30943360,
"step": 975
},
{
"epoch": 0.41853512705530643,
"grad_norm": 0.08215602487325668,
"learning_rate": 3.1321824837017875e-05,
"loss": 0.5558,
"num_input_tokens_seen": 31062304,
"step": 980
},
{
"epoch": 0.420670510356609,
"grad_norm": 0.11376603692770004,
"learning_rate": 3.1159387218641575e-05,
"loss": 0.7323,
"num_input_tokens_seen": 31233792,
"step": 985
},
{
"epoch": 0.4228058936579116,
"grad_norm": 0.08716494590044022,
"learning_rate": 3.099667228562064e-05,
"loss": 0.6371,
"num_input_tokens_seen": 31383616,
"step": 990
},
{
"epoch": 0.4249412769592142,
"grad_norm": 0.05800577253103256,
"learning_rate": 3.083368736388414e-05,
"loss": 0.6631,
"num_input_tokens_seen": 31559968,
"step": 995
},
{
"epoch": 0.42707666026051677,
"grad_norm": 0.08554735034704208,
"learning_rate": 3.067043979151687e-05,
"loss": 0.6021,
"num_input_tokens_seen": 31716480,
"step": 1000
},
{
"epoch": 0.42921204356181936,
"grad_norm": 0.08346949517726898,
"learning_rate": 3.0506936918428947e-05,
"loss": 0.5901,
"num_input_tokens_seen": 31861568,
"step": 1005
},
{
"epoch": 0.43134742686312194,
"grad_norm": 0.16743424534797668,
"learning_rate": 3.0343186106024946e-05,
"loss": 0.5969,
"num_input_tokens_seen": 32023008,
"step": 1010
},
{
"epoch": 0.4334828101644245,
"grad_norm": 0.08071965724229813,
"learning_rate": 3.01791947268724e-05,
"loss": 0.6469,
"num_input_tokens_seen": 32213024,
"step": 1015
},
{
"epoch": 0.4356181934657271,
"grad_norm": 0.11266499757766724,
"learning_rate": 3.0014970164369936e-05,
"loss": 0.6257,
"num_input_tokens_seen": 32382752,
"step": 1020
},
{
"epoch": 0.4377535767670297,
"grad_norm": 0.09486319869756699,
"learning_rate": 2.985051981241479e-05,
"loss": 0.7496,
"num_input_tokens_seen": 32520832,
"step": 1025
},
{
"epoch": 0.4398889600683323,
"grad_norm": 0.1076025515794754,
"learning_rate": 2.9685851075069954e-05,
"loss": 0.8778,
"num_input_tokens_seen": 32673472,
"step": 1030
},
{
"epoch": 0.44202434336963486,
"grad_norm": 0.12652435898780823,
"learning_rate": 2.9520971366230783e-05,
"loss": 0.7424,
"num_input_tokens_seen": 32850272,
"step": 1035
},
{
"epoch": 0.44415972667093745,
"grad_norm": 0.11113929003477097,
"learning_rate": 2.9355888109291247e-05,
"loss": 0.8948,
"num_input_tokens_seen": 32994432,
"step": 1040
},
{
"epoch": 0.44629510997224003,
"grad_norm": 0.07004854828119278,
"learning_rate": 2.9190608736809664e-05,
"loss": 0.6752,
"num_input_tokens_seen": 33134112,
"step": 1045
},
{
"epoch": 0.4484304932735426,
"grad_norm": 0.10912331193685532,
"learning_rate": 2.902514069017409e-05,
"loss": 0.8079,
"num_input_tokens_seen": 33307008,
"step": 1050
},
{
"epoch": 0.4505658765748452,
"grad_norm": 0.08094992488622665,
"learning_rate": 2.8859491419267264e-05,
"loss": 0.6908,
"num_input_tokens_seen": 33478752,
"step": 1055
},
{
"epoch": 0.4527012598761478,
"grad_norm": 0.09789257496595383,
"learning_rate": 2.86936683821312e-05,
"loss": 0.6369,
"num_input_tokens_seen": 33641728,
"step": 1060
},
{
"epoch": 0.4548366431774504,
"grad_norm": 0.07772962003946304,
"learning_rate": 2.8527679044631417e-05,
"loss": 0.6272,
"num_input_tokens_seen": 33819104,
"step": 1065
},
{
"epoch": 0.45697202647875296,
"grad_norm": 0.07876738905906677,
"learning_rate": 2.836153088012078e-05,
"loss": 0.5017,
"num_input_tokens_seen": 33946336,
"step": 1070
},
{
"epoch": 0.45910740978005554,
"grad_norm": 0.07158119231462479,
"learning_rate": 2.8195231369103042e-05,
"loss": 0.5854,
"num_input_tokens_seen": 34111232,
"step": 1075
},
{
"epoch": 0.4612427930813581,
"grad_norm": 0.07409899681806564,
"learning_rate": 2.802878799889605e-05,
"loss": 0.5877,
"num_input_tokens_seen": 34269536,
"step": 1080
},
{
"epoch": 0.4633781763826607,
"grad_norm": 0.16344216465950012,
"learning_rate": 2.786220826329462e-05,
"loss": 0.7302,
"num_input_tokens_seen": 34420224,
"step": 1085
},
{
"epoch": 0.4655135596839633,
"grad_norm": 0.09065761417150497,
"learning_rate": 2.7695499662233164e-05,
"loss": 0.9365,
"num_input_tokens_seen": 34559872,
"step": 1090
},
{
"epoch": 0.4676489429852659,
"grad_norm": 0.07718425989151001,
"learning_rate": 2.752866970144803e-05,
"loss": 0.6596,
"num_input_tokens_seen": 34734400,
"step": 1095
},
{
"epoch": 0.46978432628656847,
"grad_norm": 0.08346325904130936,
"learning_rate": 2.7361725892139533e-05,
"loss": 0.7114,
"num_input_tokens_seen": 34888416,
"step": 1100
},
{
"epoch": 0.47191970958787105,
"grad_norm": 0.08522050827741623,
"learning_rate": 2.719467575063382e-05,
"loss": 0.5746,
"num_input_tokens_seen": 35020992,
"step": 1105
},
{
"epoch": 0.4740550928891736,
"grad_norm": 0.09076400846242905,
"learning_rate": 2.7027526798044427e-05,
"loss": 0.7177,
"num_input_tokens_seen": 35215072,
"step": 1110
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.06955017149448395,
"learning_rate": 2.6860286559933684e-05,
"loss": 0.6877,
"num_input_tokens_seen": 35380928,
"step": 1115
},
{
"epoch": 0.47832585949177875,
"grad_norm": 0.08468913286924362,
"learning_rate": 2.6692962565973866e-05,
"loss": 0.6099,
"num_input_tokens_seen": 35540480,
"step": 1120
},
{
"epoch": 0.48046124279308133,
"grad_norm": 0.08094287663698196,
"learning_rate": 2.652556234960821e-05,
"loss": 0.5757,
"num_input_tokens_seen": 35704256,
"step": 1125
},
{
"epoch": 0.4825966260943839,
"grad_norm": 0.09746932238340378,
"learning_rate": 2.635809344771169e-05,
"loss": 0.683,
"num_input_tokens_seen": 35856608,
"step": 1130
},
{
"epoch": 0.4847320093956865,
"grad_norm": 0.08693865686655045,
"learning_rate": 2.619056340025175e-05,
"loss": 0.6502,
"num_input_tokens_seen": 35999840,
"step": 1135
},
{
"epoch": 0.4868673926969891,
"grad_norm": 0.09562770277261734,
"learning_rate": 2.6022979749948783e-05,
"loss": 0.6337,
"num_input_tokens_seen": 36129152,
"step": 1140
},
{
"epoch": 0.48900277599829167,
"grad_norm": 0.11900558322668076,
"learning_rate": 2.5855350041936537e-05,
"loss": 0.7166,
"num_input_tokens_seen": 36293152,
"step": 1145
},
{
"epoch": 0.49113815929959426,
"grad_norm": 0.08834047615528107,
"learning_rate": 2.5687681823422445e-05,
"loss": 0.7633,
"num_input_tokens_seen": 36445696,
"step": 1150
},
{
"epoch": 0.49327354260089684,
"grad_norm": 0.07423476129770279,
"learning_rate": 2.551998264334777e-05,
"loss": 0.6183,
"num_input_tokens_seen": 36614528,
"step": 1155
},
{
"epoch": 0.4954089259021994,
"grad_norm": 0.08447694778442383,
"learning_rate": 2.5352260052047788e-05,
"loss": 0.5267,
"num_input_tokens_seen": 36754880,
"step": 1160
},
{
"epoch": 0.497544309203502,
"grad_norm": 0.09028150141239166,
"learning_rate": 2.518452160091181e-05,
"loss": 0.684,
"num_input_tokens_seen": 36932000,
"step": 1165
},
{
"epoch": 0.4996796925048046,
"grad_norm": 0.10303398221731186,
"learning_rate": 2.5016774842043194e-05,
"loss": 0.7886,
"num_input_tokens_seen": 37093504,
"step": 1170
},
{
"epoch": 0.5018150758061072,
"grad_norm": 0.08447935432195663,
"learning_rate": 2.484902732791936e-05,
"loss": 0.691,
"num_input_tokens_seen": 37272736,
"step": 1175
},
{
"epoch": 0.5039504591074098,
"grad_norm": 0.08549617975950241,
"learning_rate": 2.4681286611051708e-05,
"loss": 0.7877,
"num_input_tokens_seen": 37425024,
"step": 1180
},
{
"epoch": 0.5060858424087123,
"grad_norm": 0.08385903388261795,
"learning_rate": 2.4513560243645635e-05,
"loss": 0.6496,
"num_input_tokens_seen": 37600736,
"step": 1185
},
{
"epoch": 0.5082212257100149,
"grad_norm": 0.08815981447696686,
"learning_rate": 2.4345855777260462e-05,
"loss": 0.6722,
"num_input_tokens_seen": 37775072,
"step": 1190
},
{
"epoch": 0.5103566090113175,
"grad_norm": 0.12655802071094513,
"learning_rate": 2.4178180762469447e-05,
"loss": 0.6637,
"num_input_tokens_seen": 37908864,
"step": 1195
},
{
"epoch": 0.5124919923126201,
"grad_norm": 0.09083867073059082,
"learning_rate": 2.4010542748519863e-05,
"loss": 0.6507,
"num_input_tokens_seen": 38099328,
"step": 1200
},
{
"epoch": 0.5146273756139227,
"grad_norm": 0.11199730634689331,
"learning_rate": 2.384294928299309e-05,
"loss": 0.8343,
"num_input_tokens_seen": 38247072,
"step": 1205
},
{
"epoch": 0.5167627589152253,
"grad_norm": 0.08594491332769394,
"learning_rate": 2.3675407911464788e-05,
"loss": 0.598,
"num_input_tokens_seen": 38391168,
"step": 1210
},
{
"epoch": 0.5188981422165279,
"grad_norm": 0.10429448634386063,
"learning_rate": 2.350792617716521e-05,
"loss": 0.6245,
"num_input_tokens_seen": 38573664,
"step": 1215
},
{
"epoch": 0.5210335255178304,
"grad_norm": 0.11104902625083923,
"learning_rate": 2.334051162063953e-05,
"loss": 0.72,
"num_input_tokens_seen": 38740672,
"step": 1220
},
{
"epoch": 0.523168908819133,
"grad_norm": 0.10164003819227219,
"learning_rate": 2.3173171779408386e-05,
"loss": 0.6333,
"num_input_tokens_seen": 38864224,
"step": 1225
},
{
"epoch": 0.5253042921204356,
"grad_norm": 0.10649612545967102,
"learning_rate": 2.3005914187628492e-05,
"loss": 0.7262,
"num_input_tokens_seen": 39000320,
"step": 1230
},
{
"epoch": 0.5274396754217382,
"grad_norm": 0.10383658111095428,
"learning_rate": 2.2838746375753456e-05,
"loss": 0.5828,
"num_input_tokens_seen": 39198400,
"step": 1235
},
{
"epoch": 0.5295750587230408,
"grad_norm": 0.10013597458600998,
"learning_rate": 2.2671675870194677e-05,
"loss": 0.6544,
"num_input_tokens_seen": 39359232,
"step": 1240
},
{
"epoch": 0.5317104420243434,
"grad_norm": 0.13857851922512054,
"learning_rate": 2.2504710192982575e-05,
"loss": 0.6669,
"num_input_tokens_seen": 39502176,
"step": 1245
},
{
"epoch": 0.533845825325646,
"grad_norm": 0.08885691314935684,
"learning_rate": 2.2337856861427843e-05,
"loss": 0.8427,
"num_input_tokens_seen": 39717472,
"step": 1250
},
{
"epoch": 0.5359812086269485,
"grad_norm": 0.11478804051876068,
"learning_rate": 2.2171123387783028e-05,
"loss": 0.5687,
"num_input_tokens_seen": 39836000,
"step": 1255
},
{
"epoch": 0.5381165919282511,
"grad_norm": 0.1051030158996582,
"learning_rate": 2.2004517278904316e-05,
"loss": 0.6957,
"num_input_tokens_seen": 39995200,
"step": 1260
},
{
"epoch": 0.5402519752295537,
"grad_norm": 0.07015421241521835,
"learning_rate": 2.183804603591352e-05,
"loss": 0.6944,
"num_input_tokens_seen": 40173280,
"step": 1265
},
{
"epoch": 0.5423873585308563,
"grad_norm": 0.10149814933538437,
"learning_rate": 2.1671717153860385e-05,
"loss": 0.7211,
"num_input_tokens_seen": 40315296,
"step": 1270
},
{
"epoch": 0.5445227418321589,
"grad_norm": 0.09945672750473022,
"learning_rate": 2.1505538121385127e-05,
"loss": 0.6752,
"num_input_tokens_seen": 40485504,
"step": 1275
},
{
"epoch": 0.5466581251334615,
"grad_norm": 0.07678119838237762,
"learning_rate": 2.133951642038127e-05,
"loss": 0.7874,
"num_input_tokens_seen": 40678624,
"step": 1280
},
{
"epoch": 0.548793508434764,
"grad_norm": 0.11939999461174011,
"learning_rate": 2.117365952565879e-05,
"loss": 0.6918,
"num_input_tokens_seen": 40829472,
"step": 1285
},
{
"epoch": 0.5509288917360666,
"grad_norm": 0.09344258159399033,
"learning_rate": 2.100797490460756e-05,
"loss": 0.6707,
"num_input_tokens_seen": 40954304,
"step": 1290
},
{
"epoch": 0.5530642750373692,
"grad_norm": 0.10135383903980255,
"learning_rate": 2.0842470016861184e-05,
"loss": 0.6515,
"num_input_tokens_seen": 41120160,
"step": 1295
},
{
"epoch": 0.5551996583386718,
"grad_norm": 0.12063171714544296,
"learning_rate": 2.06771523139611e-05,
"loss": 0.7781,
"num_input_tokens_seen": 41283680,
"step": 1300
},
{
"epoch": 0.5573350416399744,
"grad_norm": 0.09838173538446426,
"learning_rate": 2.051202923902112e-05,
"loss": 0.6262,
"num_input_tokens_seen": 41416448,
"step": 1305
},
{
"epoch": 0.559470424941277,
"grad_norm": 0.11905540525913239,
"learning_rate": 2.0347108226392285e-05,
"loss": 0.5474,
"num_input_tokens_seen": 41563552,
"step": 1310
},
{
"epoch": 0.5616058082425796,
"grad_norm": 0.09312383085489273,
"learning_rate": 2.0182396701328187e-05,
"loss": 0.7023,
"num_input_tokens_seen": 41713152,
"step": 1315
},
{
"epoch": 0.5637411915438821,
"grad_norm": 0.09516125172376633,
"learning_rate": 2.001790207965062e-05,
"loss": 0.8375,
"num_input_tokens_seen": 41901728,
"step": 1320
},
{
"epoch": 0.5658765748451847,
"grad_norm": 0.10551753640174866,
"learning_rate": 1.9853631767415737e-05,
"loss": 0.7857,
"num_input_tokens_seen": 42031776,
"step": 1325
},
{
"epoch": 0.5680119581464873,
"grad_norm": 0.09541548788547516,
"learning_rate": 1.9689593160580577e-05,
"loss": 0.7697,
"num_input_tokens_seen": 42196352,
"step": 1330
},
{
"epoch": 0.5701473414477899,
"grad_norm": 0.1404384821653366,
"learning_rate": 1.9525793644670094e-05,
"loss": 0.8586,
"num_input_tokens_seen": 42341088,
"step": 1335
},
{
"epoch": 0.5722827247490925,
"grad_norm": 0.1053939163684845,
"learning_rate": 1.93622405944446e-05,
"loss": 0.8365,
"num_input_tokens_seen": 42495424,
"step": 1340
},
{
"epoch": 0.5744181080503951,
"grad_norm": 0.1150602251291275,
"learning_rate": 1.9198941373567797e-05,
"loss": 0.6521,
"num_input_tokens_seen": 42622080,
"step": 1345
},
{
"epoch": 0.5765534913516976,
"grad_norm": 0.09714847803115845,
"learning_rate": 1.9035903334275186e-05,
"loss": 0.8343,
"num_input_tokens_seen": 42817472,
"step": 1350
},
{
"epoch": 0.5786888746530002,
"grad_norm": 0.11403302848339081,
"learning_rate": 1.887313381704308e-05,
"loss": 0.6469,
"num_input_tokens_seen": 42967968,
"step": 1355
},
{
"epoch": 0.5808242579543028,
"grad_norm": 0.10145643353462219,
"learning_rate": 1.871064015025808e-05,
"loss": 0.6199,
"num_input_tokens_seen": 43113120,
"step": 1360
},
{
"epoch": 0.5829596412556054,
"grad_norm": 0.12413822114467621,
"learning_rate": 1.8548429649887167e-05,
"loss": 0.6748,
"num_input_tokens_seen": 43311584,
"step": 1365
},
{
"epoch": 0.585095024556908,
"grad_norm": 0.10621116310358047,
"learning_rate": 1.8386509619148283e-05,
"loss": 0.6825,
"num_input_tokens_seen": 43468704,
"step": 1370
},
{
"epoch": 0.5872304078582106,
"grad_norm": 0.08581121265888214,
"learning_rate": 1.822488734818153e-05,
"loss": 0.7961,
"num_input_tokens_seen": 43629152,
"step": 1375
},
{
"epoch": 0.5893657911595132,
"grad_norm": 0.10057251155376434,
"learning_rate": 1.8063570113720955e-05,
"loss": 0.7024,
"num_input_tokens_seen": 43796384,
"step": 1380
},
{
"epoch": 0.5915011744608157,
"grad_norm": 0.145149365067482,
"learning_rate": 1.79025651787669e-05,
"loss": 0.7315,
"num_input_tokens_seen": 43972640,
"step": 1385
},
{
"epoch": 0.5936365577621183,
"grad_norm": 0.09588214010000229,
"learning_rate": 1.7741879792259033e-05,
"loss": 0.7955,
"num_input_tokens_seen": 44110080,
"step": 1390
},
{
"epoch": 0.5957719410634209,
"grad_norm": 0.10795921087265015,
"learning_rate": 1.7581521188749968e-05,
"loss": 0.8156,
"num_input_tokens_seen": 44270080,
"step": 1395
},
{
"epoch": 0.5979073243647235,
"grad_norm": 0.13513167202472687,
"learning_rate": 1.742149658807952e-05,
"loss": 0.688,
"num_input_tokens_seen": 44437280,
"step": 1400
},
{
"epoch": 0.6000427076660261,
"grad_norm": 0.0809662714600563,
"learning_rate": 1.7261813195049682e-05,
"loss": 0.7067,
"num_input_tokens_seen": 44579680,
"step": 1405
},
{
"epoch": 0.6021780909673287,
"grad_norm": 0.08051643520593643,
"learning_rate": 1.7102478199100218e-05,
"loss": 0.565,
"num_input_tokens_seen": 44788832,
"step": 1410
},
{
"epoch": 0.6043134742686312,
"grad_norm": 0.08201641589403152,
"learning_rate": 1.6943498773984974e-05,
"loss": 0.5555,
"num_input_tokens_seen": 44951488,
"step": 1415
},
{
"epoch": 0.6064488575699338,
"grad_norm": 0.07378476113080978,
"learning_rate": 1.678488207744891e-05,
"loss": 0.7106,
"num_input_tokens_seen": 45127232,
"step": 1420
},
{
"epoch": 0.6085842408712364,
"grad_norm": 0.08412224799394608,
"learning_rate": 1.6626635250905813e-05,
"loss": 0.8088,
"num_input_tokens_seen": 45290592,
"step": 1425
},
{
"epoch": 0.610719624172539,
"grad_norm": 0.09182008355855942,
"learning_rate": 1.646876541911679e-05,
"loss": 0.5566,
"num_input_tokens_seen": 45429920,
"step": 1430
},
{
"epoch": 0.6128550074738416,
"grad_norm": 0.11553499102592468,
"learning_rate": 1.6311279689869464e-05,
"loss": 0.6124,
"num_input_tokens_seen": 45612000,
"step": 1435
},
{
"epoch": 0.6149903907751442,
"grad_norm": 0.1281968653202057,
"learning_rate": 1.615418515365799e-05,
"loss": 0.764,
"num_input_tokens_seen": 45752192,
"step": 1440
},
{
"epoch": 0.6171257740764468,
"grad_norm": 0.11949111521244049,
"learning_rate": 1.5997488883363804e-05,
"loss": 0.6346,
"num_input_tokens_seen": 45927808,
"step": 1445
},
{
"epoch": 0.6192611573777493,
"grad_norm": 0.1383758783340454,
"learning_rate": 1.5841197933937164e-05,
"loss": 0.5827,
"num_input_tokens_seen": 46082432,
"step": 1450
},
{
"epoch": 0.6213965406790519,
"grad_norm": 0.09209062159061432,
"learning_rate": 1.568531934207955e-05,
"loss": 0.6316,
"num_input_tokens_seen": 46226688,
"step": 1455
},
{
"epoch": 0.6235319239803545,
"grad_norm": 0.16895094513893127,
"learning_rate": 1.552986012592681e-05,
"loss": 0.7383,
"num_input_tokens_seen": 46361216,
"step": 1460
},
{
"epoch": 0.6256673072816571,
"grad_norm": 0.07766853272914886,
"learning_rate": 1.5374827284733223e-05,
"loss": 0.598,
"num_input_tokens_seen": 46511840,
"step": 1465
},
{
"epoch": 0.6278026905829597,
"grad_norm": 0.09342877566814423,
"learning_rate": 1.5220227798556333e-05,
"loss": 0.6047,
"num_input_tokens_seen": 46635328,
"step": 1470
},
{
"epoch": 0.6299380738842623,
"grad_norm": 0.07859272509813309,
"learning_rate": 1.5066068627942714e-05,
"loss": 0.5981,
"num_input_tokens_seen": 46791520,
"step": 1475
},
{
"epoch": 0.6320734571855648,
"grad_norm": 0.0829625129699707,
"learning_rate": 1.4912356713614573e-05,
"loss": 0.9216,
"num_input_tokens_seen": 46964672,
"step": 1480
},
{
"epoch": 0.6342088404868674,
"grad_norm": 0.08610516041517258,
"learning_rate": 1.4759098976157227e-05,
"loss": 0.7327,
"num_input_tokens_seen": 47116864,
"step": 1485
},
{
"epoch": 0.63634422378817,
"grad_norm": 0.10078553855419159,
"learning_rate": 1.4606302315707587e-05,
"loss": 0.6273,
"num_input_tokens_seen": 47249824,
"step": 1490
},
{
"epoch": 0.6384796070894726,
"grad_norm": 0.10765385627746582,
"learning_rate": 1.4453973611643445e-05,
"loss": 0.6039,
"num_input_tokens_seen": 47405440,
"step": 1495
},
{
"epoch": 0.6406149903907752,
"grad_norm": 0.08604435622692108,
"learning_rate": 1.4302119722273727e-05,
"loss": 0.6372,
"num_input_tokens_seen": 47560960,
"step": 1500
},
{
"epoch": 0.6427503736920778,
"grad_norm": 0.09638124704360962,
"learning_rate": 1.4150747484529758e-05,
"loss": 0.5995,
"num_input_tokens_seen": 47726656,
"step": 1505
},
{
"epoch": 0.6448857569933804,
"grad_norm": 0.08920534700155258,
"learning_rate": 1.3999863713657405e-05,
"loss": 0.7475,
"num_input_tokens_seen": 47882784,
"step": 1510
},
{
"epoch": 0.6470211402946829,
"grad_norm": 0.10143899917602539,
"learning_rate": 1.3849475202910244e-05,
"loss": 0.7008,
"num_input_tokens_seen": 48048608,
"step": 1515
},
{
"epoch": 0.6491565235959855,
"grad_norm": 0.10630396008491516,
"learning_rate": 1.369958872324374e-05,
"loss": 0.5906,
"num_input_tokens_seen": 48167424,
"step": 1520
},
{
"epoch": 0.6512919068972881,
"grad_norm": 0.10320613533258438,
"learning_rate": 1.3550211023010346e-05,
"loss": 0.7876,
"num_input_tokens_seen": 48342048,
"step": 1525
},
{
"epoch": 0.6534272901985907,
"grad_norm": 0.10990385711193085,
"learning_rate": 1.3401348827655665e-05,
"loss": 0.6946,
"num_input_tokens_seen": 48519488,
"step": 1530
},
{
"epoch": 0.6555626734998933,
"grad_norm": 0.08516086637973785,
"learning_rate": 1.3253008839415726e-05,
"loss": 0.661,
"num_input_tokens_seen": 48671424,
"step": 1535
},
{
"epoch": 0.6576980568011959,
"grad_norm": 0.11356549710035324,
"learning_rate": 1.310519773701515e-05,
"loss": 0.6125,
"num_input_tokens_seen": 48796000,
"step": 1540
},
{
"epoch": 0.6598334401024984,
"grad_norm": 0.10029956698417664,
"learning_rate": 1.2957922175366493e-05,
"loss": 0.6231,
"num_input_tokens_seen": 48973024,
"step": 1545
},
{
"epoch": 0.661968823403801,
"grad_norm": 0.09604058414697647,
"learning_rate": 1.2811188785270617e-05,
"loss": 0.836,
"num_input_tokens_seen": 49140192,
"step": 1550
},
{
"epoch": 0.6641042067051036,
"grad_norm": 0.09177996963262558,
"learning_rate": 1.2665004173118136e-05,
"loss": 0.6581,
"num_input_tokens_seen": 49313920,
"step": 1555
},
{
"epoch": 0.6662395900064062,
"grad_norm": 0.10683578252792358,
"learning_rate": 1.2519374920591987e-05,
"loss": 0.6878,
"num_input_tokens_seen": 49480096,
"step": 1560
},
{
"epoch": 0.6683749733077088,
"grad_norm": 0.09613426774740219,
"learning_rate": 1.2374307584371104e-05,
"loss": 0.7337,
"num_input_tokens_seen": 49635936,
"step": 1565
},
{
"epoch": 0.6705103566090114,
"grad_norm": 0.08746462315320969,
"learning_rate": 1.222980869583521e-05,
"loss": 0.6751,
"num_input_tokens_seen": 49749408,
"step": 1570
},
{
"epoch": 0.672645739910314,
"grad_norm": 0.11159204691648483,
"learning_rate": 1.2085884760770755e-05,
"loss": 0.7597,
"num_input_tokens_seen": 49916512,
"step": 1575
},
{
"epoch": 0.6747811232116165,
"grad_norm": 0.08674119412899017,
"learning_rate": 1.1942542259078013e-05,
"loss": 0.7161,
"num_input_tokens_seen": 50054080,
"step": 1580
},
{
"epoch": 0.676916506512919,
"grad_norm": 0.0944414883852005,
"learning_rate": 1.1799787644479329e-05,
"loss": 0.6078,
"num_input_tokens_seen": 50209472,
"step": 1585
},
{
"epoch": 0.6790518898142216,
"grad_norm": 0.10381105542182922,
"learning_rate": 1.165762734422855e-05,
"loss": 0.7661,
"num_input_tokens_seen": 50374560,
"step": 1590
},
{
"epoch": 0.6811872731155242,
"grad_norm": 0.09648651629686356,
"learning_rate": 1.1516067758821658e-05,
"loss": 0.7189,
"num_input_tokens_seen": 50525632,
"step": 1595
},
{
"epoch": 0.6833226564168268,
"grad_norm": 0.10135359317064285,
"learning_rate": 1.13751152617086e-05,
"loss": 0.7739,
"num_input_tokens_seen": 50678080,
"step": 1600
},
{
"epoch": 0.6854580397181294,
"grad_norm": 0.09060854464769363,
"learning_rate": 1.1234776199006324e-05,
"loss": 0.8047,
"num_input_tokens_seen": 50845056,
"step": 1605
},
{
"epoch": 0.6875934230194319,
"grad_norm": 0.06740930676460266,
"learning_rate": 1.1095056889213073e-05,
"loss": 0.599,
"num_input_tokens_seen": 51008896,
"step": 1610
},
{
"epoch": 0.6897288063207345,
"grad_norm": 0.09671995788812637,
"learning_rate": 1.0955963622923896e-05,
"loss": 0.6548,
"num_input_tokens_seen": 51176448,
"step": 1615
},
{
"epoch": 0.6918641896220371,
"grad_norm": 0.0861692875623703,
"learning_rate": 1.0817502662547426e-05,
"loss": 0.6567,
"num_input_tokens_seen": 51347616,
"step": 1620
},
{
"epoch": 0.6939995729233397,
"grad_norm": 0.11806908249855042,
"learning_rate": 1.0679680242023946e-05,
"loss": 0.5926,
"num_input_tokens_seen": 51512000,
"step": 1625
},
{
"epoch": 0.6961349562246423,
"grad_norm": 0.10389918833971024,
"learning_rate": 1.0542502566544668e-05,
"loss": 0.8239,
"num_input_tokens_seen": 51659328,
"step": 1630
},
{
"epoch": 0.6982703395259449,
"grad_norm": 0.07497014105319977,
"learning_rate": 1.040597581227242e-05,
"loss": 0.7617,
"num_input_tokens_seen": 51806176,
"step": 1635
},
{
"epoch": 0.7004057228272474,
"grad_norm": 0.07773059606552124,
"learning_rate": 1.0270106126063539e-05,
"loss": 0.6469,
"num_input_tokens_seen": 51930816,
"step": 1640
},
{
"epoch": 0.70254110612855,
"grad_norm": 0.10639885812997818,
"learning_rate": 1.0134899625191124e-05,
"loss": 0.8937,
"num_input_tokens_seen": 52054944,
"step": 1645
},
{
"epoch": 0.7046764894298526,
"grad_norm": 0.09907250851392746,
"learning_rate": 1.0000362397069612e-05,
"loss": 0.6863,
"num_input_tokens_seen": 52213536,
"step": 1650
},
{
"epoch": 0.7068118727311552,
"grad_norm": 0.11581376940011978,
"learning_rate": 9.866500498980744e-06,
"loss": 0.6294,
"num_input_tokens_seen": 52366624,
"step": 1655
},
{
"epoch": 0.7089472560324578,
"grad_norm": 0.10165643692016602,
"learning_rate": 9.733319957800781e-06,
"loss": 0.644,
"num_input_tokens_seen": 52518688,
"step": 1660
},
{
"epoch": 0.7110826393337604,
"grad_norm": 0.09698858112096786,
"learning_rate": 9.60082676972921e-06,
"loss": 0.658,
"num_input_tokens_seen": 52656384,
"step": 1665
},
{
"epoch": 0.713218022635063,
"grad_norm": 0.1165652796626091,
"learning_rate": 9.469026900018758e-06,
"loss": 0.7008,
"num_input_tokens_seen": 52816832,
"step": 1670
},
{
"epoch": 0.7153534059363655,
"grad_norm": 0.11195079982280731,
"learning_rate": 9.337926282706794e-06,
"loss": 0.6814,
"num_input_tokens_seen": 52979936,
"step": 1675
},
{
"epoch": 0.7174887892376681,
"grad_norm": 0.07277271896600723,
"learning_rate": 9.20753082034821e-06,
"loss": 0.5933,
"num_input_tokens_seen": 53151136,
"step": 1680
},
{
"epoch": 0.7196241725389707,
"grad_norm": 0.11374859511852264,
"learning_rate": 9.077846383749631e-06,
"loss": 0.7048,
"num_input_tokens_seen": 53375680,
"step": 1685
},
{
"epoch": 0.7217595558402733,
"grad_norm": 0.08321022987365723,
"learning_rate": 8.948878811705109e-06,
"loss": 0.7039,
"num_input_tokens_seen": 53558240,
"step": 1690
},
{
"epoch": 0.7238949391415759,
"grad_norm": 0.09429024904966354,
"learning_rate": 8.820633910733237e-06,
"loss": 0.7525,
"num_input_tokens_seen": 53744960,
"step": 1695
},
{
"epoch": 0.7260303224428785,
"grad_norm": 0.09550992399454117,
"learning_rate": 8.693117454815728e-06,
"loss": 0.595,
"num_input_tokens_seen": 53884480,
"step": 1700
},
{
"epoch": 0.728165705744181,
"grad_norm": 0.09551380574703217,
"learning_rate": 8.566335185137437e-06,
"loss": 0.5853,
"num_input_tokens_seen": 54077792,
"step": 1705
},
{
"epoch": 0.7303010890454836,
"grad_norm": 0.10770967602729797,
"learning_rate": 8.440292809827898e-06,
"loss": 0.7973,
"num_input_tokens_seen": 54246368,
"step": 1710
},
{
"epoch": 0.7324364723467862,
"grad_norm": 0.12636590003967285,
"learning_rate": 8.314996003704305e-06,
"loss": 0.8046,
"num_input_tokens_seen": 54422240,
"step": 1715
},
{
"epoch": 0.7345718556480888,
"grad_norm": 0.10689777135848999,
"learning_rate": 8.190450408016032e-06,
"loss": 0.5263,
"num_input_tokens_seen": 54574592,
"step": 1720
},
{
"epoch": 0.7367072389493914,
"grad_norm": 0.09278780221939087,
"learning_rate": 8.06666163019063e-06,
"loss": 0.6577,
"num_input_tokens_seen": 54728160,
"step": 1725
},
{
"epoch": 0.738842622250694,
"grad_norm": 0.10053995251655579,
"learning_rate": 7.943635243581373e-06,
"loss": 0.6628,
"num_input_tokens_seen": 54895072,
"step": 1730
},
{
"epoch": 0.7409780055519966,
"grad_norm": 0.10549025237560272,
"learning_rate": 7.821376787216333e-06,
"loss": 0.6087,
"num_input_tokens_seen": 55072256,
"step": 1735
},
{
"epoch": 0.7431133888532991,
"grad_norm": 0.08755512535572052,
"learning_rate": 7.699891765548983e-06,
"loss": 0.6766,
"num_input_tokens_seen": 55237888,
"step": 1740
},
{
"epoch": 0.7452487721546017,
"grad_norm": 0.10339244455099106,
"learning_rate": 7.5791856482103765e-06,
"loss": 0.6222,
"num_input_tokens_seen": 55398048,
"step": 1745
},
{
"epoch": 0.7473841554559043,
"grad_norm": 0.09155864268541336,
"learning_rate": 7.459263869762892e-06,
"loss": 0.6083,
"num_input_tokens_seen": 55558336,
"step": 1750
},
{
"epoch": 0.7495195387572069,
"grad_norm": 0.11388752609491348,
"learning_rate": 7.340131829455541e-06,
"loss": 0.7643,
"num_input_tokens_seen": 55717888,
"step": 1755
},
{
"epoch": 0.7516549220585095,
"grad_norm": 0.0928613469004631,
"learning_rate": 7.221794890980888e-06,
"loss": 0.6745,
"num_input_tokens_seen": 55894816,
"step": 1760
},
{
"epoch": 0.7537903053598121,
"grad_norm": 0.09511938691139221,
"learning_rate": 7.104258382233556e-06,
"loss": 0.6846,
"num_input_tokens_seen": 56071360,
"step": 1765
},
{
"epoch": 0.7559256886611146,
"grad_norm": 0.07386107742786407,
"learning_rate": 6.987527595070356e-06,
"loss": 0.625,
"num_input_tokens_seen": 56188384,
"step": 1770
},
{
"epoch": 0.7580610719624172,
"grad_norm": 0.09641123563051224,
"learning_rate": 6.871607785071999e-06,
"loss": 0.6852,
"num_input_tokens_seen": 56365312,
"step": 1775
},
{
"epoch": 0.7601964552637198,
"grad_norm": 0.08215915411710739,
"learning_rate": 6.756504171306521e-06,
"loss": 0.7002,
"num_input_tokens_seen": 56509120,
"step": 1780
},
{
"epoch": 0.7623318385650224,
"grad_norm": 0.09883769601583481,
"learning_rate": 6.642221936094281e-06,
"loss": 0.7087,
"num_input_tokens_seen": 56652384,
"step": 1785
},
{
"epoch": 0.764467221866325,
"grad_norm": 0.09604239463806152,
"learning_rate": 6.528766224774619e-06,
"loss": 0.5355,
"num_input_tokens_seen": 56796704,
"step": 1790
},
{
"epoch": 0.7666026051676276,
"grad_norm": 0.0826464369893074,
"learning_rate": 6.416142145474244e-06,
"loss": 0.727,
"num_input_tokens_seen": 56975872,
"step": 1795
},
{
"epoch": 0.7687379884689302,
"grad_norm": 0.08775708824396133,
"learning_rate": 6.304354768877196e-06,
"loss": 0.7101,
"num_input_tokens_seen": 57147296,
"step": 1800
},
{
"epoch": 0.7708733717702327,
"grad_norm": 0.07710240036249161,
"learning_rate": 6.1934091279965915e-06,
"loss": 0.799,
"num_input_tokens_seen": 57302368,
"step": 1805
},
{
"epoch": 0.7730087550715353,
"grad_norm": 0.10319597274065018,
"learning_rate": 6.083310217947991e-06,
"loss": 0.6874,
"num_input_tokens_seen": 57471200,
"step": 1810
},
{
"epoch": 0.7751441383728379,
"grad_norm": 0.12237267196178436,
"learning_rate": 5.974062995724527e-06,
"loss": 0.7995,
"num_input_tokens_seen": 57679840,
"step": 1815
},
{
"epoch": 0.7772795216741405,
"grad_norm": 0.11243870854377747,
"learning_rate": 5.865672379973702e-06,
"loss": 0.6763,
"num_input_tokens_seen": 57849248,
"step": 1820
},
{
"epoch": 0.7794149049754431,
"grad_norm": 0.08665511757135391,
"learning_rate": 5.75814325077596e-06,
"loss": 0.5619,
"num_input_tokens_seen": 57993952,
"step": 1825
},
{
"epoch": 0.7815502882767457,
"grad_norm": 0.09945985674858093,
"learning_rate": 5.651480449424954e-06,
"loss": 0.6884,
"num_input_tokens_seen": 58146592,
"step": 1830
},
{
"epoch": 0.7836856715780482,
"grad_norm": 0.10123780369758606,
"learning_rate": 5.545688778209579e-06,
"loss": 0.7584,
"num_input_tokens_seen": 58307808,
"step": 1835
},
{
"epoch": 0.7858210548793508,
"grad_norm": 0.08710220456123352,
"learning_rate": 5.440773000197763e-06,
"loss": 0.7216,
"num_input_tokens_seen": 58462528,
"step": 1840
},
{
"epoch": 0.7879564381806534,
"grad_norm": 0.10483860224485397,
"learning_rate": 5.3367378390220184e-06,
"loss": 0.5983,
"num_input_tokens_seen": 58626784,
"step": 1845
},
{
"epoch": 0.790091821481956,
"grad_norm": 0.10729069262742996,
"learning_rate": 5.233587978666754e-06,
"loss": 0.5874,
"num_input_tokens_seen": 58805760,
"step": 1850
},
{
"epoch": 0.7922272047832586,
"grad_norm": 0.08131475001573563,
"learning_rate": 5.131328063257415e-06,
"loss": 0.6549,
"num_input_tokens_seen": 58989248,
"step": 1855
},
{
"epoch": 0.7943625880845612,
"grad_norm": 0.12807467579841614,
"learning_rate": 5.029962696851365e-06,
"loss": 0.7086,
"num_input_tokens_seen": 59127904,
"step": 1860
},
{
"epoch": 0.7964979713858638,
"grad_norm": 0.1114497184753418,
"learning_rate": 4.9294964432306105e-06,
"loss": 0.6751,
"num_input_tokens_seen": 59290880,
"step": 1865
},
{
"epoch": 0.7986333546871663,
"grad_norm": 0.0979105532169342,
"learning_rate": 4.829933825696328e-06,
"loss": 0.6631,
"num_input_tokens_seen": 59453504,
"step": 1870
},
{
"epoch": 0.8007687379884689,
"grad_norm": 0.10672794282436371,
"learning_rate": 4.731279326865193e-06,
"loss": 0.6248,
"num_input_tokens_seen": 59628704,
"step": 1875
},
{
"epoch": 0.8029041212897715,
"grad_norm": 0.09161815047264099,
"learning_rate": 4.633537388467582e-06,
"loss": 0.6742,
"num_input_tokens_seen": 59770720,
"step": 1880
},
{
"epoch": 0.8050395045910741,
"grad_norm": 0.10243742913007736,
"learning_rate": 4.536712411147573e-06,
"loss": 0.6084,
"num_input_tokens_seen": 59929280,
"step": 1885
},
{
"epoch": 0.8071748878923767,
"grad_norm": 0.09579010307788849,
"learning_rate": 4.4408087542648334e-06,
"loss": 0.7314,
"num_input_tokens_seen": 60045152,
"step": 1890
},
{
"epoch": 0.8093102711936793,
"grad_norm": 0.10613362491130829,
"learning_rate": 4.345830735698322e-06,
"loss": 0.6492,
"num_input_tokens_seen": 60163840,
"step": 1895
},
{
"epoch": 0.8114456544949818,
"grad_norm": 0.10478969663381577,
"learning_rate": 4.251782631651918e-06,
"loss": 0.7565,
"num_input_tokens_seen": 60329152,
"step": 1900
},
{
"epoch": 0.8135810377962844,
"grad_norm": 0.1022254079580307,
"learning_rate": 4.158668676461866e-06,
"loss": 0.6302,
"num_input_tokens_seen": 60451264,
"step": 1905
},
{
"epoch": 0.815716421097587,
"grad_norm": 0.12434552609920502,
"learning_rate": 4.0664930624061375e-06,
"loss": 0.6156,
"num_input_tokens_seen": 60607008,
"step": 1910
},
{
"epoch": 0.8178518043988896,
"grad_norm": 0.09911098331212997,
"learning_rate": 3.975259939515708e-06,
"loss": 0.6657,
"num_input_tokens_seen": 60764064,
"step": 1915
},
{
"epoch": 0.8199871877001922,
"grad_norm": 0.10193871706724167,
"learning_rate": 3.884973415387652e-06,
"loss": 0.834,
"num_input_tokens_seen": 60919072,
"step": 1920
},
{
"epoch": 0.8221225710014948,
"grad_norm": 0.09091677516698837,
"learning_rate": 3.79563755500027e-06,
"loss": 0.6426,
"num_input_tokens_seen": 61074976,
"step": 1925
},
{
"epoch": 0.8242579543027974,
"grad_norm": 0.09682322293519974,
"learning_rate": 3.7072563805300497e-06,
"loss": 0.7106,
"num_input_tokens_seen": 61209088,
"step": 1930
},
{
"epoch": 0.8263933376040999,
"grad_norm": 0.09818655252456665,
"learning_rate": 3.61983387117055e-06,
"loss": 0.724,
"num_input_tokens_seen": 61367360,
"step": 1935
},
{
"epoch": 0.8285287209054025,
"grad_norm": 0.0938807874917984,
"learning_rate": 3.533373962953271e-06,
"loss": 0.7054,
"num_input_tokens_seen": 61506976,
"step": 1940
},
{
"epoch": 0.8306641042067051,
"grad_norm": 0.09612589329481125,
"learning_rate": 3.447880548570434e-06,
"loss": 0.5991,
"num_input_tokens_seen": 61661280,
"step": 1945
},
{
"epoch": 0.8327994875080077,
"grad_norm": 0.10615026950836182,
"learning_rate": 3.3633574771997245e-06,
"loss": 0.8037,
"num_input_tokens_seen": 61813056,
"step": 1950
},
{
"epoch": 0.8349348708093103,
"grad_norm": 0.08966366946697235,
"learning_rate": 3.2798085543309847e-06,
"loss": 0.7369,
"num_input_tokens_seen": 61970752,
"step": 1955
},
{
"epoch": 0.8370702541106129,
"grad_norm": 0.10236942023038864,
"learning_rate": 3.1972375415948884e-06,
"loss": 0.5092,
"num_input_tokens_seen": 62085728,
"step": 1960
},
{
"epoch": 0.8392056374119155,
"grad_norm": 0.09586668014526367,
"learning_rate": 3.1156481565935563e-06,
"loss": 0.5488,
"num_input_tokens_seen": 62232288,
"step": 1965
},
{
"epoch": 0.841341020713218,
"grad_norm": 0.09763219207525253,
"learning_rate": 3.035044072733209e-06,
"loss": 0.8189,
"num_input_tokens_seen": 62418272,
"step": 1970
},
{
"epoch": 0.8434764040145206,
"grad_norm": 0.09863479435443878,
"learning_rate": 2.955428919058767e-06,
"loss": 0.7843,
"num_input_tokens_seen": 62560416,
"step": 1975
},
{
"epoch": 0.8456117873158232,
"grad_norm": 0.10871785879135132,
"learning_rate": 2.876806280090449e-06,
"loss": 0.6783,
"num_input_tokens_seen": 62713120,
"step": 1980
},
{
"epoch": 0.8477471706171258,
"grad_norm": 0.08632975071668625,
"learning_rate": 2.7991796956624017e-06,
"loss": 0.6642,
"num_input_tokens_seen": 62906304,
"step": 1985
},
{
"epoch": 0.8498825539184284,
"grad_norm": 0.11040724813938141,
"learning_rate": 2.7225526607633167e-06,
"loss": 0.697,
"num_input_tokens_seen": 63043552,
"step": 1990
},
{
"epoch": 0.852017937219731,
"grad_norm": 0.08328652381896973,
"learning_rate": 2.6469286253790777e-06,
"loss": 0.549,
"num_input_tokens_seen": 63192608,
"step": 1995
},
{
"epoch": 0.8541533205210335,
"grad_norm": 0.11789990216493607,
"learning_rate": 2.5723109943374264e-06,
"loss": 0.8259,
"num_input_tokens_seen": 63379296,
"step": 2000
},
{
"epoch": 0.8562887038223361,
"grad_norm": 0.08858389407396317,
"learning_rate": 2.4987031271546753e-06,
"loss": 0.6236,
"num_input_tokens_seen": 63540576,
"step": 2005
},
{
"epoch": 0.8584240871236387,
"grad_norm": 0.08800710737705231,
"learning_rate": 2.4261083378844557e-06,
"loss": 0.6153,
"num_input_tokens_seen": 63710688,
"step": 2010
},
{
"epoch": 0.8605594704249413,
"grad_norm": 0.11924576759338379,
"learning_rate": 2.354529894968485e-06,
"loss": 0.6785,
"num_input_tokens_seen": 63879584,
"step": 2015
},
{
"epoch": 0.8626948537262439,
"grad_norm": 0.08962240815162659,
"learning_rate": 2.2839710210894372e-06,
"loss": 0.6377,
"num_input_tokens_seen": 64015744,
"step": 2020
},
{
"epoch": 0.8648302370275465,
"grad_norm": 0.115207739174366,
"learning_rate": 2.214434893025838e-06,
"loss": 0.4801,
"num_input_tokens_seen": 64187232,
"step": 2025
},
{
"epoch": 0.866965620328849,
"grad_norm": 0.1438085287809372,
"learning_rate": 2.1459246415090312e-06,
"loss": 0.7073,
"num_input_tokens_seen": 64331968,
"step": 2030
},
{
"epoch": 0.8691010036301516,
"grad_norm": 0.12350678443908691,
"learning_rate": 2.078443351082232e-06,
"loss": 0.7264,
"num_input_tokens_seen": 64482816,
"step": 2035
},
{
"epoch": 0.8712363869314542,
"grad_norm": 0.1743326038122177,
"learning_rate": 2.011994059961647e-06,
"loss": 0.7054,
"num_input_tokens_seen": 64634368,
"step": 2040
},
{
"epoch": 0.8733717702327568,
"grad_norm": 0.10089342296123505,
"learning_rate": 1.9465797598996914e-06,
"loss": 0.7034,
"num_input_tokens_seen": 64787424,
"step": 2045
},
{
"epoch": 0.8755071535340594,
"grad_norm": 0.10029490292072296,
"learning_rate": 1.8822033960502722e-06,
"loss": 0.593,
"num_input_tokens_seen": 64935616,
"step": 2050
},
{
"epoch": 0.877642536835362,
"grad_norm": 0.13283027708530426,
"learning_rate": 1.8188678668362102e-06,
"loss": 0.7639,
"num_input_tokens_seen": 65103392,
"step": 2055
},
{
"epoch": 0.8797779201366646,
"grad_norm": 0.10776066035032272,
"learning_rate": 1.7565760238187401e-06,
"loss": 0.6378,
"num_input_tokens_seen": 65236032,
"step": 2060
},
{
"epoch": 0.8819133034379671,
"grad_norm": 0.11559037119150162,
"learning_rate": 1.6953306715690925e-06,
"loss": 0.572,
"num_input_tokens_seen": 65374432,
"step": 2065
},
{
"epoch": 0.8840486867392697,
"grad_norm": 0.10408779978752136,
"learning_rate": 1.6351345675422874e-06,
"loss": 0.6153,
"num_input_tokens_seen": 65554048,
"step": 2070
},
{
"epoch": 0.8861840700405723,
"grad_norm": 0.1286764293909073,
"learning_rate": 1.5759904219529249e-06,
"loss": 0.7024,
"num_input_tokens_seen": 65719584,
"step": 2075
},
{
"epoch": 0.8883194533418749,
"grad_norm": 0.10344738513231277,
"learning_rate": 1.5179008976531878e-06,
"loss": 0.6698,
"num_input_tokens_seen": 65911616,
"step": 2080
},
{
"epoch": 0.8904548366431775,
"grad_norm": 0.10264074802398682,
"learning_rate": 1.4608686100129553e-06,
"loss": 0.7602,
"num_input_tokens_seen": 66080480,
"step": 2085
},
{
"epoch": 0.8925902199444801,
"grad_norm": 0.08111412823200226,
"learning_rate": 1.4048961268020384e-06,
"loss": 0.5967,
"num_input_tokens_seen": 66237376,
"step": 2090
},
{
"epoch": 0.8947256032457827,
"grad_norm": 0.10298115760087967,
"learning_rate": 1.3499859680745852e-06,
"loss": 0.7729,
"num_input_tokens_seen": 66404128,
"step": 2095
},
{
"epoch": 0.8968609865470852,
"grad_norm": 0.13184696435928345,
"learning_rate": 1.2961406060556097e-06,
"loss": 0.7682,
"num_input_tokens_seen": 66587872,
"step": 2100
},
{
"epoch": 0.8989963698483878,
"grad_norm": 0.1918001025915146,
"learning_rate": 1.2433624650296905e-06,
"loss": 0.8945,
"num_input_tokens_seen": 66708672,
"step": 2105
},
{
"epoch": 0.9011317531496904,
"grad_norm": 0.09620165079832077,
"learning_rate": 1.191653921231811e-06,
"loss": 0.5526,
"num_input_tokens_seen": 66862912,
"step": 2110
},
{
"epoch": 0.903267136450993,
"grad_norm": 0.11877051740884781,
"learning_rate": 1.1410173027403882e-06,
"loss": 0.6192,
"num_input_tokens_seen": 66976480,
"step": 2115
},
{
"epoch": 0.9054025197522956,
"grad_norm": 0.08998879045248032,
"learning_rate": 1.0914548893724563e-06,
"loss": 0.6662,
"num_input_tokens_seen": 67155712,
"step": 2120
},
{
"epoch": 0.9075379030535982,
"grad_norm": 0.10443535447120667,
"learning_rate": 1.042968912581005e-06,
"loss": 0.6332,
"num_input_tokens_seen": 67288000,
"step": 2125
},
{
"epoch": 0.9096732863549007,
"grad_norm": 0.10673485696315765,
"learning_rate": 9.955615553545295e-07,
"loss": 0.8033,
"num_input_tokens_seen": 67437632,
"step": 2130
},
{
"epoch": 0.9118086696562033,
"grad_norm": 0.07716970145702362,
"learning_rate": 9.492349521187355e-07,
"loss": 0.6562,
"num_input_tokens_seen": 67584288,
"step": 2135
},
{
"epoch": 0.9139440529575059,
"grad_norm": 0.13065995275974274,
"learning_rate": 9.039911886404462e-07,
"loss": 0.6629,
"num_input_tokens_seen": 67741024,
"step": 2140
},
{
"epoch": 0.9160794362588085,
"grad_norm": 0.09159885346889496,
"learning_rate": 8.59832301933694e-07,
"loss": 0.7827,
"num_input_tokens_seen": 67904928,
"step": 2145
},
{
"epoch": 0.9182148195601111,
"grad_norm": 0.09007798880338669,
"learning_rate": 8.16760280168008e-07,
"loss": 0.6068,
"num_input_tokens_seen": 68084128,
"step": 2150
},
{
"epoch": 0.9203502028614137,
"grad_norm": 0.0953405424952507,
"learning_rate": 7.747770625788964e-07,
"loss": 0.6923,
"num_input_tokens_seen": 68252704,
"step": 2155
},
{
"epoch": 0.9224855861627163,
"grad_norm": 0.10631420463323593,
"learning_rate": 7.338845393805388e-07,
"loss": 0.6895,
"num_input_tokens_seen": 68375360,
"step": 2160
},
{
"epoch": 0.9246209694640188,
"grad_norm": 0.12354771047830582,
"learning_rate": 6.940845516806849e-07,
"loss": 0.721,
"num_input_tokens_seen": 68542272,
"step": 2165
},
{
"epoch": 0.9267563527653214,
"grad_norm": 0.07840372622013092,
"learning_rate": 6.553788913977593e-07,
"loss": 0.7807,
"num_input_tokens_seen": 68703584,
"step": 2170
},
{
"epoch": 0.928891736066624,
"grad_norm": 0.09137984365224838,
"learning_rate": 6.177693011801877e-07,
"loss": 0.6796,
"num_input_tokens_seen": 68845760,
"step": 2175
},
{
"epoch": 0.9310271193679266,
"grad_norm": 0.0817914679646492,
"learning_rate": 5.812574743279286e-07,
"loss": 0.6509,
"num_input_tokens_seen": 69031072,
"step": 2180
},
{
"epoch": 0.9331625026692292,
"grad_norm": 0.13196462392807007,
"learning_rate": 5.458450547162486e-07,
"loss": 0.7432,
"num_input_tokens_seen": 69207200,
"step": 2185
},
{
"epoch": 0.9352978859705318,
"grad_norm": 0.08276062458753586,
"learning_rate": 5.115336367217005e-07,
"loss": 0.6785,
"num_input_tokens_seen": 69374944,
"step": 2190
},
{
"epoch": 0.9374332692718343,
"grad_norm": 0.1287129819393158,
"learning_rate": 4.783247651503398e-07,
"loss": 0.561,
"num_input_tokens_seen": 69527520,
"step": 2195
},
{
"epoch": 0.9395686525731369,
"grad_norm": 0.1089451014995575,
"learning_rate": 4.4621993516818227e-07,
"loss": 0.6363,
"num_input_tokens_seen": 69688256,
"step": 2200
},
{
"epoch": 0.9417040358744395,
"grad_norm": 0.10153474658727646,
"learning_rate": 4.152205922338698e-07,
"loss": 0.6927,
"num_input_tokens_seen": 69851776,
"step": 2205
},
{
"epoch": 0.9438394191757421,
"grad_norm": 0.1068960577249527,
"learning_rate": 3.8532813203360775e-07,
"loss": 0.6462,
"num_input_tokens_seen": 70017856,
"step": 2210
},
{
"epoch": 0.9459748024770446,
"grad_norm": 0.08118557184934616,
"learning_rate": 3.565439004183241e-07,
"loss": 0.6962,
"num_input_tokens_seen": 70153888,
"step": 2215
},
{
"epoch": 0.9481101857783472,
"grad_norm": 0.10270223766565323,
"learning_rate": 3.288691933430621e-07,
"loss": 0.6935,
"num_input_tokens_seen": 70292832,
"step": 2220
},
{
"epoch": 0.9502455690796497,
"grad_norm": 0.09596558660268784,
"learning_rate": 3.023052568086493e-07,
"loss": 0.6684,
"num_input_tokens_seen": 70448448,
"step": 2225
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.13479304313659668,
"learning_rate": 2.768532868055923e-07,
"loss": 0.7058,
"num_input_tokens_seen": 70595488,
"step": 2230
},
{
"epoch": 0.9545163356822549,
"grad_norm": 0.10255875438451767,
"learning_rate": 2.5251442926021715e-07,
"loss": 0.6543,
"num_input_tokens_seen": 70756416,
"step": 2235
},
{
"epoch": 0.9566517189835575,
"grad_norm": 0.09057821333408356,
"learning_rate": 2.292897799831051e-07,
"loss": 0.727,
"num_input_tokens_seen": 70944896,
"step": 2240
},
{
"epoch": 0.9587871022848601,
"grad_norm": 0.10975092649459839,
"learning_rate": 2.0718038461972345e-07,
"loss": 0.6602,
"num_input_tokens_seen": 71083616,
"step": 2245
},
{
"epoch": 0.9609224855861627,
"grad_norm": 0.11111032962799072,
"learning_rate": 1.8618723860336916e-07,
"loss": 0.6301,
"num_input_tokens_seen": 71240480,
"step": 2250
},
{
"epoch": 0.9630578688874653,
"grad_norm": 0.11888109892606735,
"learning_rate": 1.663112871103406e-07,
"loss": 0.6893,
"num_input_tokens_seen": 71427648,
"step": 2255
},
{
"epoch": 0.9651932521887678,
"grad_norm": 0.11165483295917511,
"learning_rate": 1.4755342501739377e-07,
"loss": 0.6536,
"num_input_tokens_seen": 71591648,
"step": 2260
},
{
"epoch": 0.9673286354900704,
"grad_norm": 0.07350827753543854,
"learning_rate": 1.2991449686143852e-07,
"loss": 0.7046,
"num_input_tokens_seen": 71735296,
"step": 2265
},
{
"epoch": 0.969464018791373,
"grad_norm": 0.12128807604312897,
"learning_rate": 1.1339529680152173e-07,
"loss": 0.559,
"num_input_tokens_seen": 71861920,
"step": 2270
},
{
"epoch": 0.9715994020926756,
"grad_norm": 0.11065730452537537,
"learning_rate": 9.799656858307527e-08,
"loss": 0.7401,
"num_input_tokens_seen": 72029568,
"step": 2275
},
{
"epoch": 0.9737347853939782,
"grad_norm": 0.10507506877183914,
"learning_rate": 8.37190055044207e-08,
"loss": 0.5554,
"num_input_tokens_seen": 72166464,
"step": 2280
},
{
"epoch": 0.9758701686952808,
"grad_norm": 0.10541801899671555,
"learning_rate": 7.056325038556911e-08,
"loss": 0.6366,
"num_input_tokens_seen": 72326496,
"step": 2285
},
{
"epoch": 0.9780055519965833,
"grad_norm": 0.09389659017324448,
"learning_rate": 5.8529895539266575e-08,
"loss": 0.5862,
"num_input_tokens_seen": 72443616,
"step": 2290
},
{
"epoch": 0.9801409352978859,
"grad_norm": 0.08955533802509308,
"learning_rate": 4.7619482744326595e-08,
"loss": 0.7018,
"num_input_tokens_seen": 72624032,
"step": 2295
},
{
"epoch": 0.9822763185991885,
"grad_norm": 0.1084001362323761,
"learning_rate": 3.7832503221249535e-08,
"loss": 0.642,
"num_input_tokens_seen": 72797184,
"step": 2300
},
{
"epoch": 0.9844117019004911,
"grad_norm": 0.11596699804067612,
"learning_rate": 2.916939761009041e-08,
"loss": 0.6432,
"num_input_tokens_seen": 72941600,
"step": 2305
},
{
"epoch": 0.9865470852017937,
"grad_norm": 0.09869848936796188,
"learning_rate": 2.1630555950635788e-08,
"loss": 0.5893,
"num_input_tokens_seen": 73116000,
"step": 2310
},
{
"epoch": 0.9886824685030963,
"grad_norm": 0.08685341477394104,
"learning_rate": 1.5216317664829004e-08,
"loss": 0.6636,
"num_input_tokens_seen": 73285504,
"step": 2315
},
{
"epoch": 0.9908178518043989,
"grad_norm": 0.10240475088357925,
"learning_rate": 9.926971541496244e-09,
"loss": 0.7166,
"num_input_tokens_seen": 73440544,
"step": 2320
},
{
"epoch": 0.9929532351057014,
"grad_norm": 0.09111962467432022,
"learning_rate": 5.762755723348612e-09,
"loss": 0.4921,
"num_input_tokens_seen": 73549760,
"step": 2325
},
{
"epoch": 0.995088618407004,
"grad_norm": 0.09901441633701324,
"learning_rate": 2.7238576962435034e-09,
"loss": 0.6256,
"num_input_tokens_seen": 73709952,
"step": 2330
},
{
"epoch": 0.9972240017083066,
"grad_norm": 0.11895426362752914,
"learning_rate": 8.104142807663361e-10,
"loss": 0.7422,
"num_input_tokens_seen": 73907360,
"step": 2335
},
{
"epoch": 0.9993593850096092,
"grad_norm": 0.12102843821048737,
"learning_rate": 2.2511626046606283e-11,
"loss": 0.656,
"num_input_tokens_seen": 74053280,
"step": 2340
},
{
"epoch": 0.9997864616698697,
"num_input_tokens_seen": 74083488,
"step": 2341,
"total_flos": 4.6864422704480256e+17,
"train_loss": 0.692321076499046,
"train_runtime": 65496.9949,
"train_samples_per_second": 1.144,
"train_steps_per_second": 0.036
}
],
"logging_steps": 5,
"max_steps": 2341,
"num_input_tokens_seen": 74083488,
"num_train_epochs": 1,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.6864422704480256e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}