yalhessi's picture
Training in progress, epoch 11, checkpoint
97c8e02 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 11.0,
"eval_steps": 720,
"global_step": 39589,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.13892747985551543,
"grad_norm": 0.4299773573875427,
"learning_rate": 0.0003953968695007873,
"loss": 0.2967,
"step": 500
},
{
"epoch": 0.2000555709919422,
"eval_loss": 0.1983761489391327,
"eval_runtime": 16.4512,
"eval_samples_per_second": 30.393,
"eval_steps_per_second": 3.83,
"step": 720
},
{
"epoch": 0.27785495971103086,
"grad_norm": 0.5706949830055237,
"learning_rate": 0.0003907659535056034,
"loss": 0.1959,
"step": 1000
},
{
"epoch": 0.4001111419838844,
"eval_loss": 0.17383727431297302,
"eval_runtime": 16.4842,
"eval_samples_per_second": 30.332,
"eval_steps_per_second": 3.822,
"step": 1440
},
{
"epoch": 0.41678243956654626,
"grad_norm": 0.5392869710922241,
"learning_rate": 0.0003861350375104196,
"loss": 0.1723,
"step": 1500
},
{
"epoch": 0.5557099194220617,
"grad_norm": 0.5005412697792053,
"learning_rate": 0.0003815041215152357,
"loss": 0.1636,
"step": 2000
},
{
"epoch": 0.6001667129758266,
"eval_loss": 0.16229495406150818,
"eval_runtime": 16.4963,
"eval_samples_per_second": 30.31,
"eval_steps_per_second": 3.819,
"step": 2160
},
{
"epoch": 0.6946373992775771,
"grad_norm": 0.370914489030838,
"learning_rate": 0.0003768732055200519,
"loss": 0.1539,
"step": 2500
},
{
"epoch": 0.8002222839677688,
"eval_loss": 0.1537286937236786,
"eval_runtime": 16.4662,
"eval_samples_per_second": 30.365,
"eval_steps_per_second": 3.826,
"step": 2880
},
{
"epoch": 0.8335648791330925,
"grad_norm": 0.41378697752952576,
"learning_rate": 0.000372242289524868,
"loss": 0.1445,
"step": 3000
},
{
"epoch": 0.972492358988608,
"grad_norm": 0.4000154137611389,
"learning_rate": 0.0003676113735296842,
"loss": 0.1384,
"step": 3500
},
{
"epoch": 1.000277854959711,
"eval_loss": 0.15115150809288025,
"eval_runtime": 16.8021,
"eval_samples_per_second": 29.758,
"eval_steps_per_second": 3.75,
"step": 3600
},
{
"epoch": 1.1114198388441234,
"grad_norm": 0.4774770140647888,
"learning_rate": 0.00036298045753450036,
"loss": 0.1317,
"step": 4000
},
{
"epoch": 1.2003334259516532,
"eval_loss": 0.14330309629440308,
"eval_runtime": 16.4814,
"eval_samples_per_second": 30.337,
"eval_steps_per_second": 3.822,
"step": 4320
},
{
"epoch": 1.2503473186996388,
"grad_norm": 0.4988621175289154,
"learning_rate": 0.0003583495415393165,
"loss": 0.1283,
"step": 4500
},
{
"epoch": 1.3892747985551543,
"grad_norm": 0.5688238739967346,
"learning_rate": 0.000353727887376123,
"loss": 0.1237,
"step": 5000
},
{
"epoch": 1.4003889969435954,
"eval_loss": 0.13683326542377472,
"eval_runtime": 16.5065,
"eval_samples_per_second": 30.291,
"eval_steps_per_second": 3.817,
"step": 5040
},
{
"epoch": 1.5282022784106695,
"grad_norm": 0.5433902740478516,
"learning_rate": 0.00034909697138093914,
"loss": 0.1174,
"step": 5500
},
{
"epoch": 1.6004445679355377,
"eval_loss": 0.13955478370189667,
"eval_runtime": 16.4989,
"eval_samples_per_second": 30.305,
"eval_steps_per_second": 3.818,
"step": 5760
},
{
"epoch": 1.667129758266185,
"grad_norm": 0.5300644040107727,
"learning_rate": 0.0003444660553857553,
"loss": 0.1178,
"step": 6000
},
{
"epoch": 1.8005001389274797,
"eval_loss": 0.13158197700977325,
"eval_runtime": 16.533,
"eval_samples_per_second": 30.243,
"eval_steps_per_second": 3.811,
"step": 6480
},
{
"epoch": 1.8060572381217006,
"grad_norm": 0.3573897182941437,
"learning_rate": 0.0003398351393905715,
"loss": 0.113,
"step": 6500
},
{
"epoch": 1.9449847179772157,
"grad_norm": 0.5019258260726929,
"learning_rate": 0.00033520422339538766,
"loss": 0.1134,
"step": 7000
},
{
"epoch": 2.000555709919422,
"eval_loss": 0.13124322891235352,
"eval_runtime": 16.0696,
"eval_samples_per_second": 31.115,
"eval_steps_per_second": 3.92,
"step": 7200
},
{
"epoch": 2.0839121978327313,
"grad_norm": 0.4890081286430359,
"learning_rate": 0.0003305733074002038,
"loss": 0.1075,
"step": 7500
},
{
"epoch": 2.2006112809113643,
"eval_loss": 0.12690122425556183,
"eval_runtime": 15.8697,
"eval_samples_per_second": 31.507,
"eval_steps_per_second": 3.97,
"step": 7920
},
{
"epoch": 2.222839677688247,
"grad_norm": 0.5411983132362366,
"learning_rate": 0.0003259423914050199,
"loss": 0.104,
"step": 8000
},
{
"epoch": 2.361767157543762,
"grad_norm": 0.892245352268219,
"learning_rate": 0.00032132073724182645,
"loss": 0.1018,
"step": 8500
},
{
"epoch": 2.4006668519033063,
"eval_loss": 0.1253955215215683,
"eval_runtime": 15.9212,
"eval_samples_per_second": 31.405,
"eval_steps_per_second": 3.957,
"step": 8640
},
{
"epoch": 2.5006946373992776,
"grad_norm": 0.5154420137405396,
"learning_rate": 0.0003166898212466426,
"loss": 0.1018,
"step": 9000
},
{
"epoch": 2.600722422895249,
"eval_loss": 0.1270376443862915,
"eval_runtime": 15.8556,
"eval_samples_per_second": 31.535,
"eval_steps_per_second": 3.973,
"step": 9360
},
{
"epoch": 2.639622117254793,
"grad_norm": 0.4247698187828064,
"learning_rate": 0.0003120681670834491,
"loss": 0.0988,
"step": 9500
},
{
"epoch": 2.7785495971103087,
"grad_norm": 0.6174339652061462,
"learning_rate": 0.0003074372510882653,
"loss": 0.0931,
"step": 10000
},
{
"epoch": 2.800777993887191,
"eval_loss": 0.12492711842060089,
"eval_runtime": 15.9536,
"eval_samples_per_second": 31.341,
"eval_steps_per_second": 3.949,
"step": 10080
},
{
"epoch": 2.917477076965824,
"grad_norm": 0.3945905864238739,
"learning_rate": 0.0003028063350930814,
"loss": 0.0924,
"step": 10500
},
{
"epoch": 3.000833564879133,
"eval_loss": 0.12177152931690216,
"eval_runtime": 16.5123,
"eval_samples_per_second": 30.28,
"eval_steps_per_second": 3.815,
"step": 10800
},
{
"epoch": 3.0564045568213394,
"grad_norm": 0.4349508285522461,
"learning_rate": 0.0002981754190978976,
"loss": 0.0929,
"step": 11000
},
{
"epoch": 3.1953320366768545,
"grad_norm": 0.5195356011390686,
"learning_rate": 0.00029354450310271375,
"loss": 0.0897,
"step": 11500
},
{
"epoch": 3.2008891358710754,
"eval_loss": 0.12157219648361206,
"eval_runtime": 15.887,
"eval_samples_per_second": 31.472,
"eval_steps_per_second": 3.965,
"step": 11520
},
{
"epoch": 3.33425951653237,
"grad_norm": 0.38773760199546814,
"learning_rate": 0.0002889135871075299,
"loss": 0.0868,
"step": 12000
},
{
"epoch": 3.4009447068630174,
"eval_loss": 0.12406055629253387,
"eval_runtime": 15.9444,
"eval_samples_per_second": 31.359,
"eval_steps_per_second": 3.951,
"step": 12240
},
{
"epoch": 3.4731869963878856,
"grad_norm": 0.3054683804512024,
"learning_rate": 0.00028428267111234605,
"loss": 0.0865,
"step": 12500
},
{
"epoch": 3.6010002778549595,
"eval_loss": 0.11476034671068192,
"eval_runtime": 15.9006,
"eval_samples_per_second": 31.445,
"eval_steps_per_second": 3.962,
"step": 12960
},
{
"epoch": 3.612114476243401,
"grad_norm": 0.5311923623085022,
"learning_rate": 0.0002796610169491526,
"loss": 0.0845,
"step": 13000
},
{
"epoch": 3.7510419560989163,
"grad_norm": 0.7641647458076477,
"learning_rate": 0.0002750301009539687,
"loss": 0.084,
"step": 13500
},
{
"epoch": 3.801055848846902,
"eval_loss": 0.11587072908878326,
"eval_runtime": 15.933,
"eval_samples_per_second": 31.381,
"eval_steps_per_second": 3.954,
"step": 13680
},
{
"epoch": 3.889969435954432,
"grad_norm": 0.5842312574386597,
"learning_rate": 0.00027039918495878483,
"loss": 0.0815,
"step": 14000
},
{
"epoch": 4.001111419838844,
"eval_loss": 0.11761430650949478,
"eval_runtime": 16.0803,
"eval_samples_per_second": 31.094,
"eval_steps_per_second": 3.918,
"step": 14400
},
{
"epoch": 4.0288969158099475,
"grad_norm": 0.5182059407234192,
"learning_rate": 0.000265768268963601,
"loss": 0.0823,
"step": 14500
},
{
"epoch": 4.167824395665463,
"grad_norm": 0.3954576253890991,
"learning_rate": 0.0002611373529684172,
"loss": 0.0753,
"step": 15000
},
{
"epoch": 4.201166990830786,
"eval_loss": 0.11391445249319077,
"eval_runtime": 15.9483,
"eval_samples_per_second": 31.351,
"eval_steps_per_second": 3.95,
"step": 15120
},
{
"epoch": 4.306751875520978,
"grad_norm": 0.5974435210227966,
"learning_rate": 0.00025650643697323335,
"loss": 0.0762,
"step": 15500
},
{
"epoch": 4.4012225618227285,
"eval_loss": 0.11403658986091614,
"eval_runtime": 15.92,
"eval_samples_per_second": 31.407,
"eval_steps_per_second": 3.957,
"step": 15840
},
{
"epoch": 4.445679355376494,
"grad_norm": 0.4496535360813141,
"learning_rate": 0.0002518755209780495,
"loss": 0.0737,
"step": 16000
},
{
"epoch": 4.584606835232009,
"grad_norm": 0.5617558360099792,
"learning_rate": 0.0002472446049828656,
"loss": 0.074,
"step": 16500
},
{
"epoch": 4.601278132814671,
"eval_loss": 0.11306341737508774,
"eval_runtime": 15.9244,
"eval_samples_per_second": 31.398,
"eval_steps_per_second": 3.956,
"step": 16560
},
{
"epoch": 4.723534315087524,
"grad_norm": 0.5999208092689514,
"learning_rate": 0.00024261368898768177,
"loss": 0.0732,
"step": 17000
},
{
"epoch": 4.801333703806613,
"eval_loss": 0.11077062785625458,
"eval_runtime": 15.9311,
"eval_samples_per_second": 31.385,
"eval_steps_per_second": 3.955,
"step": 17280
},
{
"epoch": 4.86246179494304,
"grad_norm": 0.3961442708969116,
"learning_rate": 0.0002379920348244883,
"loss": 0.0724,
"step": 17500
},
{
"epoch": 5.001389274798555,
"grad_norm": 0.507563054561615,
"learning_rate": 0.00023336111882930443,
"loss": 0.0685,
"step": 18000
},
{
"epoch": 5.001389274798555,
"eval_loss": 0.11523561179637909,
"eval_runtime": 16.1329,
"eval_samples_per_second": 30.992,
"eval_steps_per_second": 3.905,
"step": 18000
},
{
"epoch": 5.14031675465407,
"grad_norm": 0.5651789307594299,
"learning_rate": 0.00022873020283412058,
"loss": 0.0655,
"step": 18500
},
{
"epoch": 5.201444845790498,
"eval_loss": 0.11398093402385712,
"eval_runtime": 15.9337,
"eval_samples_per_second": 31.38,
"eval_steps_per_second": 3.954,
"step": 18720
},
{
"epoch": 5.279244234509586,
"grad_norm": 0.619132399559021,
"learning_rate": 0.00022409928683893675,
"loss": 0.0664,
"step": 19000
},
{
"epoch": 5.401500416782439,
"eval_loss": 0.11203750967979431,
"eval_runtime": 15.8871,
"eval_samples_per_second": 31.472,
"eval_steps_per_second": 3.965,
"step": 19440
},
{
"epoch": 5.418171714365101,
"grad_norm": 0.4724760353565216,
"learning_rate": 0.0002194683708437529,
"loss": 0.0636,
"step": 19500
},
{
"epoch": 5.5570991942206165,
"grad_norm": 0.5861866474151611,
"learning_rate": 0.00021483745484856908,
"loss": 0.0648,
"step": 20000
},
{
"epoch": 5.601555987774382,
"eval_loss": 0.1131061241030693,
"eval_runtime": 15.952,
"eval_samples_per_second": 31.344,
"eval_steps_per_second": 3.949,
"step": 20160
},
{
"epoch": 5.6960266740761325,
"grad_norm": 0.5262423157691956,
"learning_rate": 0.0002102065388533852,
"loss": 0.063,
"step": 20500
},
{
"epoch": 5.801611558766324,
"eval_loss": 0.11248422414064407,
"eval_runtime": 15.9153,
"eval_samples_per_second": 31.416,
"eval_steps_per_second": 3.958,
"step": 20880
},
{
"epoch": 5.834954153931648,
"grad_norm": 0.6074294447898865,
"learning_rate": 0.00020557562285820135,
"loss": 0.0624,
"step": 21000
},
{
"epoch": 5.973881633787163,
"grad_norm": 0.5349674820899963,
"learning_rate": 0.0002009539686950079,
"loss": 0.0609,
"step": 21500
},
{
"epoch": 6.001667129758266,
"eval_loss": 0.11405794322490692,
"eval_runtime": 16.1656,
"eval_samples_per_second": 30.93,
"eval_steps_per_second": 3.897,
"step": 21600
},
{
"epoch": 6.112809113642679,
"grad_norm": 0.36713194847106934,
"learning_rate": 0.00019632305269982403,
"loss": 0.0576,
"step": 22000
},
{
"epoch": 6.201722700750208,
"eval_loss": 0.11051186919212341,
"eval_runtime": 15.9395,
"eval_samples_per_second": 31.369,
"eval_steps_per_second": 3.952,
"step": 22320
},
{
"epoch": 6.251736593498194,
"grad_norm": 0.4714512526988983,
"learning_rate": 0.00019169213670464018,
"loss": 0.057,
"step": 22500
},
{
"epoch": 6.390664073353709,
"grad_norm": 0.3419685363769531,
"learning_rate": 0.00018706122070945633,
"loss": 0.0572,
"step": 23000
},
{
"epoch": 6.401778271742151,
"eval_loss": 0.1142740249633789,
"eval_runtime": 15.9346,
"eval_samples_per_second": 31.378,
"eval_steps_per_second": 3.954,
"step": 23040
},
{
"epoch": 6.529591553209225,
"grad_norm": 0.43148958683013916,
"learning_rate": 0.0001824303047142725,
"loss": 0.0554,
"step": 23500
},
{
"epoch": 6.601833842734093,
"eval_loss": 0.1115042194724083,
"eval_runtime": 15.9938,
"eval_samples_per_second": 31.262,
"eval_steps_per_second": 3.939,
"step": 23760
},
{
"epoch": 6.66851903306474,
"grad_norm": 0.5623305439949036,
"learning_rate": 0.00017779938871908863,
"loss": 0.0538,
"step": 24000
},
{
"epoch": 6.801889413726035,
"eval_loss": 0.11134042590856552,
"eval_runtime": 15.9452,
"eval_samples_per_second": 31.357,
"eval_steps_per_second": 3.951,
"step": 24480
},
{
"epoch": 6.807446512920255,
"grad_norm": 0.4066413640975952,
"learning_rate": 0.0001731684727239048,
"loss": 0.0534,
"step": 24500
},
{
"epoch": 6.946373992775771,
"grad_norm": 0.4021354019641876,
"learning_rate": 0.00016853755672872095,
"loss": 0.052,
"step": 25000
},
{
"epoch": 7.001944984717977,
"eval_loss": 0.1132456511259079,
"eval_runtime": 16.1483,
"eval_samples_per_second": 30.963,
"eval_steps_per_second": 3.901,
"step": 25200
},
{
"epoch": 7.085301472631286,
"grad_norm": 0.46669623255729675,
"learning_rate": 0.00016391590256552746,
"loss": 0.0498,
"step": 25500
},
{
"epoch": 7.20200055570992,
"eval_loss": 0.11319959908723831,
"eval_runtime": 15.9896,
"eval_samples_per_second": 31.27,
"eval_steps_per_second": 3.94,
"step": 25920
},
{
"epoch": 7.2242289524868015,
"grad_norm": 0.2583458125591278,
"learning_rate": 0.00015928498657034364,
"loss": 0.0487,
"step": 26000
},
{
"epoch": 7.3631564323423175,
"grad_norm": 0.3225070536136627,
"learning_rate": 0.00015465407057515976,
"loss": 0.0485,
"step": 26500
},
{
"epoch": 7.402056126701861,
"eval_loss": 0.11151115596294403,
"eval_runtime": 15.9848,
"eval_samples_per_second": 31.28,
"eval_steps_per_second": 3.941,
"step": 26640
},
{
"epoch": 7.502083912197833,
"grad_norm": 0.4772126376628876,
"learning_rate": 0.00015002315457997593,
"loss": 0.0483,
"step": 27000
},
{
"epoch": 7.602111697693804,
"eval_loss": 0.11146976053714752,
"eval_runtime": 15.9333,
"eval_samples_per_second": 31.381,
"eval_steps_per_second": 3.954,
"step": 27360
},
{
"epoch": 7.641011392053348,
"grad_norm": 0.7825577855110168,
"learning_rate": 0.00014540150041678245,
"loss": 0.0478,
"step": 27500
},
{
"epoch": 7.779938871908864,
"grad_norm": 0.465191513299942,
"learning_rate": 0.0001407705844215986,
"loss": 0.0469,
"step": 28000
},
{
"epoch": 7.802167268685746,
"eval_loss": 0.11259140819311142,
"eval_runtime": 15.9752,
"eval_samples_per_second": 31.299,
"eval_steps_per_second": 3.944,
"step": 28080
},
{
"epoch": 7.918866351764379,
"grad_norm": 0.2701134979724884,
"learning_rate": 0.00013613966842641474,
"loss": 0.0443,
"step": 28500
},
{
"epoch": 8.002222839677689,
"eval_loss": 0.11337699741125107,
"eval_runtime": 16.0812,
"eval_samples_per_second": 31.092,
"eval_steps_per_second": 3.918,
"step": 28800
},
{
"epoch": 8.057793831619895,
"grad_norm": 0.3994615375995636,
"learning_rate": 0.00013151801426322128,
"loss": 0.044,
"step": 29000
},
{
"epoch": 8.19672131147541,
"grad_norm": 0.46412038803100586,
"learning_rate": 0.00012688709826803743,
"loss": 0.0421,
"step": 29500
},
{
"epoch": 8.20227841066963,
"eval_loss": 0.11495082080364227,
"eval_runtime": 15.9654,
"eval_samples_per_second": 31.318,
"eval_steps_per_second": 3.946,
"step": 29520
},
{
"epoch": 8.335648791330925,
"grad_norm": 0.3810461759567261,
"learning_rate": 0.00012225618227285358,
"loss": 0.0411,
"step": 30000
},
{
"epoch": 8.402333981661572,
"eval_loss": 0.11439384520053864,
"eval_runtime": 16.014,
"eval_samples_per_second": 31.223,
"eval_steps_per_second": 3.934,
"step": 30240
},
{
"epoch": 8.474576271186441,
"grad_norm": 0.4397641122341156,
"learning_rate": 0.00011762526627766973,
"loss": 0.0412,
"step": 30500
},
{
"epoch": 8.602389552653515,
"eval_loss": 0.11167102307081223,
"eval_runtime": 15.9143,
"eval_samples_per_second": 31.418,
"eval_steps_per_second": 3.959,
"step": 30960
},
{
"epoch": 8.613503751041955,
"grad_norm": 0.7023443579673767,
"learning_rate": 0.00011300361211447625,
"loss": 0.041,
"step": 31000
},
{
"epoch": 8.752431230897471,
"grad_norm": 0.5792316198348999,
"learning_rate": 0.00010837269611929239,
"loss": 0.0391,
"step": 31500
},
{
"epoch": 8.802445123645457,
"eval_loss": 0.11271476745605469,
"eval_runtime": 15.9859,
"eval_samples_per_second": 31.278,
"eval_steps_per_second": 3.941,
"step": 31680
},
{
"epoch": 8.891358710752987,
"grad_norm": 0.44151026010513306,
"learning_rate": 0.00010374178012410855,
"loss": 0.0403,
"step": 32000
},
{
"epoch": 9.0025006946374,
"eval_loss": 0.11616696417331696,
"eval_runtime": 16.1103,
"eval_samples_per_second": 31.036,
"eval_steps_per_second": 3.911,
"step": 32400
},
{
"epoch": 9.030286190608502,
"grad_norm": 0.3094378411769867,
"learning_rate": 9.911086412892471e-05,
"loss": 0.0386,
"step": 32500
},
{
"epoch": 9.169213670464018,
"grad_norm": 0.4858907163143158,
"learning_rate": 9.447994813374086e-05,
"loss": 0.0354,
"step": 33000
},
{
"epoch": 9.202556265629342,
"eval_loss": 0.11926531791687012,
"eval_runtime": 16.0468,
"eval_samples_per_second": 31.159,
"eval_steps_per_second": 3.926,
"step": 33120
},
{
"epoch": 9.308141150319534,
"grad_norm": 0.40236544609069824,
"learning_rate": 8.9849032138557e-05,
"loss": 0.0354,
"step": 33500
},
{
"epoch": 9.402611836621285,
"eval_loss": 0.12175790965557098,
"eval_runtime": 15.9794,
"eval_samples_per_second": 31.29,
"eval_steps_per_second": 3.943,
"step": 33840
},
{
"epoch": 9.447068630175048,
"grad_norm": 0.8239908218383789,
"learning_rate": 8.521811614337317e-05,
"loss": 0.035,
"step": 34000
},
{
"epoch": 9.585996110030564,
"grad_norm": 0.2754063308238983,
"learning_rate": 8.059646198017968e-05,
"loss": 0.0352,
"step": 34500
},
{
"epoch": 9.602667407613225,
"eval_loss": 0.11963404715061188,
"eval_runtime": 15.9949,
"eval_samples_per_second": 31.26,
"eval_steps_per_second": 3.939,
"step": 34560
},
{
"epoch": 9.72492358988608,
"grad_norm": 0.4275870621204376,
"learning_rate": 7.596554598499583e-05,
"loss": 0.0356,
"step": 35000
},
{
"epoch": 9.802722978605168,
"eval_loss": 0.12364204972982407,
"eval_runtime": 15.9291,
"eval_samples_per_second": 31.389,
"eval_steps_per_second": 3.955,
"step": 35280
},
{
"epoch": 9.863851069741594,
"grad_norm": 0.41111400723457336,
"learning_rate": 7.133462998981199e-05,
"loss": 0.0322,
"step": 35500
},
{
"epoch": 10.00277854959711,
"grad_norm": 0.4361058175563812,
"learning_rate": 6.67129758266185e-05,
"loss": 0.0331,
"step": 36000
},
{
"epoch": 10.00277854959711,
"eval_loss": 0.1233987957239151,
"eval_runtime": 16.0606,
"eval_samples_per_second": 31.132,
"eval_steps_per_second": 3.923,
"step": 36000
},
{
"epoch": 10.141706029452626,
"grad_norm": 0.3986058235168457,
"learning_rate": 6.208205983143465e-05,
"loss": 0.032,
"step": 36500
},
{
"epoch": 10.202834120589053,
"eval_loss": 0.12648221850395203,
"eval_runtime": 15.987,
"eval_samples_per_second": 31.275,
"eval_steps_per_second": 3.941,
"step": 36720
},
{
"epoch": 10.28063350930814,
"grad_norm": 0.1770099699497223,
"learning_rate": 5.745114383625081e-05,
"loss": 0.0302,
"step": 37000
},
{
"epoch": 10.402889691580995,
"eval_loss": 0.1288907825946808,
"eval_runtime": 15.9585,
"eval_samples_per_second": 31.331,
"eval_steps_per_second": 3.948,
"step": 37440
},
{
"epoch": 10.419560989163656,
"grad_norm": 0.33153316378593445,
"learning_rate": 5.282022784106696e-05,
"loss": 0.0299,
"step": 37500
},
{
"epoch": 10.558488469019172,
"grad_norm": 0.4955579340457916,
"learning_rate": 4.818931184588312e-05,
"loss": 0.0301,
"step": 38000
},
{
"epoch": 10.602945262572938,
"eval_loss": 0.1280115395784378,
"eval_runtime": 15.9981,
"eval_samples_per_second": 31.254,
"eval_steps_per_second": 3.938,
"step": 38160
},
{
"epoch": 10.697415948874687,
"grad_norm": 0.6109702587127686,
"learning_rate": 4.355839585069927e-05,
"loss": 0.0295,
"step": 38500
},
{
"epoch": 10.803000833564878,
"eval_loss": 0.12585216760635376,
"eval_runtime": 15.9876,
"eval_samples_per_second": 31.274,
"eval_steps_per_second": 3.941,
"step": 38880
},
{
"epoch": 10.836343428730203,
"grad_norm": 0.3070131242275238,
"learning_rate": 3.8927479855515425e-05,
"loss": 0.0298,
"step": 39000
},
{
"epoch": 10.975270908585719,
"grad_norm": 0.5334280133247375,
"learning_rate": 3.429656386033158e-05,
"loss": 0.028,
"step": 39500
}
],
"logging_steps": 500,
"max_steps": 43188,
"num_input_tokens_seen": 0,
"num_train_epochs": 12,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.419404232849097e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}