lombardata's picture
Training in progress, epoch 1
c0efc49 verified
raw
history blame
36.1 kB
{
"best_metric": 0.466439425945282,
"best_model_checkpoint": "/home/datawork-iot-nos/Seatizen/models/multilabel/drone/drone-DinoVdeau-from-binary-large-2024_11_14-batch-size16_freeze_probs/checkpoint-22776",
"epoch": 62.0,
"eval_steps": 500,
"global_step": 27156,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"eval_explained_variance": 0.3631434440612793,
"eval_kl_divergence": 0.421912282705307,
"eval_loss": 0.4821413457393646,
"eval_mae": 0.13084472715854645,
"eval_rmse": 0.173090398311615,
"eval_runtime": 64.2475,
"eval_samples_per_second": 36.64,
"eval_steps_per_second": 2.304,
"learning_rate": 0.001,
"step": 438
},
{
"epoch": 1.1415525114155252,
"grad_norm": 0.4757365882396698,
"learning_rate": 0.001,
"loss": 0.5317,
"step": 500
},
{
"epoch": 2.0,
"eval_explained_variance": 0.3752269744873047,
"eval_kl_divergence": 0.6148446202278137,
"eval_loss": 0.4784533977508545,
"eval_mae": 0.12629373371601105,
"eval_rmse": 0.17098082602024078,
"eval_runtime": 63.6833,
"eval_samples_per_second": 36.964,
"eval_steps_per_second": 2.324,
"learning_rate": 0.001,
"step": 876
},
{
"epoch": 2.2831050228310503,
"grad_norm": 0.4254082143306732,
"learning_rate": 0.001,
"loss": 0.4832,
"step": 1000
},
{
"epoch": 3.0,
"eval_explained_variance": 0.3838556706905365,
"eval_kl_divergence": 0.48802560567855835,
"eval_loss": 0.47776785492897034,
"eval_mae": 0.12731628119945526,
"eval_rmse": 0.16985835134983063,
"eval_runtime": 62.637,
"eval_samples_per_second": 37.582,
"eval_steps_per_second": 2.363,
"learning_rate": 0.001,
"step": 1314
},
{
"epoch": 3.4246575342465753,
"grad_norm": 0.2670271098613739,
"learning_rate": 0.001,
"loss": 0.4791,
"step": 1500
},
{
"epoch": 4.0,
"eval_explained_variance": 0.38376739621162415,
"eval_kl_divergence": 0.3418101370334625,
"eval_loss": 0.4793245792388916,
"eval_mae": 0.12901858985424042,
"eval_rmse": 0.171015664935112,
"eval_runtime": 62.2828,
"eval_samples_per_second": 37.795,
"eval_steps_per_second": 2.376,
"learning_rate": 0.001,
"step": 1752
},
{
"epoch": 4.566210045662101,
"grad_norm": 0.20498104393482208,
"learning_rate": 0.001,
"loss": 0.4771,
"step": 2000
},
{
"epoch": 5.0,
"eval_explained_variance": 0.40547776222229004,
"eval_kl_divergence": 0.34562820196151733,
"eval_loss": 0.47521594166755676,
"eval_mae": 0.12799377739429474,
"eval_rmse": 0.16736441850662231,
"eval_runtime": 62.7606,
"eval_samples_per_second": 37.508,
"eval_steps_per_second": 2.358,
"learning_rate": 0.001,
"step": 2190
},
{
"epoch": 5.707762557077626,
"grad_norm": 0.24335043132305145,
"learning_rate": 0.001,
"loss": 0.4752,
"step": 2500
},
{
"epoch": 6.0,
"eval_explained_variance": 0.3849389851093292,
"eval_kl_divergence": 0.6402714848518372,
"eval_loss": 0.478865385055542,
"eval_mae": 0.12540282309055328,
"eval_rmse": 0.17068879306316376,
"eval_runtime": 63.4836,
"eval_samples_per_second": 37.08,
"eval_steps_per_second": 2.331,
"learning_rate": 0.001,
"step": 2628
},
{
"epoch": 6.8493150684931505,
"grad_norm": 0.18768365681171417,
"learning_rate": 0.001,
"loss": 0.4752,
"step": 3000
},
{
"epoch": 7.0,
"eval_explained_variance": 0.3788411617279053,
"eval_kl_divergence": 0.5491646528244019,
"eval_loss": 0.4779475927352905,
"eval_mae": 0.12878474593162537,
"eval_rmse": 0.17091502249240875,
"eval_runtime": 63.4904,
"eval_samples_per_second": 37.076,
"eval_steps_per_second": 2.331,
"learning_rate": 0.001,
"step": 3066
},
{
"epoch": 7.9908675799086755,
"grad_norm": 0.1587909311056137,
"learning_rate": 0.001,
"loss": 0.4735,
"step": 3500
},
{
"epoch": 8.0,
"eval_explained_variance": 0.40857037901878357,
"eval_kl_divergence": 0.33827269077301025,
"eval_loss": 0.4756968021392822,
"eval_mae": 0.12695902585983276,
"eval_rmse": 0.16784566640853882,
"eval_runtime": 64.8792,
"eval_samples_per_second": 36.283,
"eval_steps_per_second": 2.281,
"learning_rate": 0.001,
"step": 3504
},
{
"epoch": 9.0,
"eval_explained_variance": 0.4111548960208893,
"eval_kl_divergence": 0.5417521595954895,
"eval_loss": 0.4731782376766205,
"eval_mae": 0.12311580032110214,
"eval_rmse": 0.1657222956418991,
"eval_runtime": 61.0673,
"eval_samples_per_second": 38.548,
"eval_steps_per_second": 2.424,
"learning_rate": 0.001,
"step": 3942
},
{
"epoch": 9.132420091324201,
"grad_norm": 0.1892658919095993,
"learning_rate": 0.001,
"loss": 0.4719,
"step": 4000
},
{
"epoch": 10.0,
"eval_explained_variance": 0.4006313979625702,
"eval_kl_divergence": 0.15472176671028137,
"eval_loss": 0.4799855649471283,
"eval_mae": 0.1320570707321167,
"eval_rmse": 0.1722680777311325,
"eval_runtime": 62.7974,
"eval_samples_per_second": 37.486,
"eval_steps_per_second": 2.357,
"learning_rate": 0.001,
"step": 4380
},
{
"epoch": 10.273972602739725,
"grad_norm": 0.20271532237529755,
"learning_rate": 0.001,
"loss": 0.4727,
"step": 4500
},
{
"epoch": 11.0,
"eval_explained_variance": 0.41580215096473694,
"eval_kl_divergence": 0.3436921238899231,
"eval_loss": 0.4731641411781311,
"eval_mae": 0.12562014162540436,
"eval_rmse": 0.16564464569091797,
"eval_runtime": 62.757,
"eval_samples_per_second": 37.51,
"eval_steps_per_second": 2.358,
"learning_rate": 0.001,
"step": 4818
},
{
"epoch": 11.415525114155251,
"grad_norm": 0.15571434795856476,
"learning_rate": 0.001,
"loss": 0.4723,
"step": 5000
},
{
"epoch": 12.0,
"eval_explained_variance": 0.401115745306015,
"eval_kl_divergence": 0.2946830093860626,
"eval_loss": 0.47767141461372375,
"eval_mae": 0.12927968800067902,
"eval_rmse": 0.17007046937942505,
"eval_runtime": 62.3684,
"eval_samples_per_second": 37.743,
"eval_steps_per_second": 2.373,
"learning_rate": 0.001,
"step": 5256
},
{
"epoch": 12.557077625570777,
"grad_norm": 0.16043365001678467,
"learning_rate": 0.001,
"loss": 0.4735,
"step": 5500
},
{
"epoch": 13.0,
"eval_explained_variance": 0.39704158902168274,
"eval_kl_divergence": 0.6136478781700134,
"eval_loss": 0.48009705543518066,
"eval_mae": 0.12475714087486267,
"eval_rmse": 0.16773907840251923,
"eval_runtime": 64.7917,
"eval_samples_per_second": 36.332,
"eval_steps_per_second": 2.284,
"learning_rate": 0.001,
"step": 5694
},
{
"epoch": 13.698630136986301,
"grad_norm": 0.13616104423999786,
"learning_rate": 0.001,
"loss": 0.4728,
"step": 6000
},
{
"epoch": 14.0,
"eval_explained_variance": 0.40355002880096436,
"eval_kl_divergence": Infinity,
"eval_loss": 0.4954195022583008,
"eval_mae": 0.12534154951572418,
"eval_rmse": 0.16692323982715607,
"eval_runtime": 62.1801,
"eval_samples_per_second": 37.858,
"eval_steps_per_second": 2.38,
"learning_rate": 0.001,
"step": 6132
},
{
"epoch": 14.840182648401827,
"grad_norm": 0.12133222818374634,
"learning_rate": 0.001,
"loss": 0.4713,
"step": 6500
},
{
"epoch": 15.0,
"eval_explained_variance": 0.4051372706890106,
"eval_kl_divergence": Infinity,
"eval_loss": 0.4812238812446594,
"eval_mae": 0.12540575861930847,
"eval_rmse": 0.16624794900417328,
"eval_runtime": 61.3206,
"eval_samples_per_second": 38.388,
"eval_steps_per_second": 2.414,
"learning_rate": 0.001,
"step": 6570
},
{
"epoch": 15.981735159817351,
"grad_norm": 0.11760278791189194,
"learning_rate": 0.0001,
"loss": 0.4706,
"step": 7000
},
{
"epoch": 16.0,
"eval_explained_variance": 0.41243478655815125,
"eval_kl_divergence": Infinity,
"eval_loss": 0.4858487546443939,
"eval_mae": 0.12432911992073059,
"eval_rmse": 0.16562338173389435,
"eval_runtime": 61.1501,
"eval_samples_per_second": 38.495,
"eval_steps_per_second": 2.42,
"learning_rate": 0.0001,
"step": 7008
},
{
"epoch": 17.0,
"eval_explained_variance": 0.4291960895061493,
"eval_kl_divergence": 0.41650328040122986,
"eval_loss": 0.47084349393844604,
"eval_mae": 0.12233477830886841,
"eval_rmse": 0.162751242518425,
"eval_runtime": 60.5736,
"eval_samples_per_second": 38.862,
"eval_steps_per_second": 2.443,
"learning_rate": 0.0001,
"step": 7446
},
{
"epoch": 17.123287671232877,
"grad_norm": 0.13284093141555786,
"learning_rate": 0.0001,
"loss": 0.4672,
"step": 7500
},
{
"epoch": 18.0,
"eval_explained_variance": 0.43114474415779114,
"eval_kl_divergence": 0.4066373407840729,
"eval_loss": 0.4707622528076172,
"eval_mae": 0.12164173275232315,
"eval_rmse": 0.16261519491672516,
"eval_runtime": 63.1248,
"eval_samples_per_second": 37.291,
"eval_steps_per_second": 2.345,
"learning_rate": 0.0001,
"step": 7884
},
{
"epoch": 18.264840182648403,
"grad_norm": 0.11859569698572159,
"learning_rate": 0.0001,
"loss": 0.4659,
"step": 8000
},
{
"epoch": 19.0,
"eval_explained_variance": 0.4342735707759857,
"eval_kl_divergence": 0.31854644417762756,
"eval_loss": 0.47095733880996704,
"eval_mae": 0.12272538989782333,
"eval_rmse": 0.16323107481002808,
"eval_runtime": 61.7089,
"eval_samples_per_second": 38.147,
"eval_steps_per_second": 2.398,
"learning_rate": 0.0001,
"step": 8322
},
{
"epoch": 19.40639269406393,
"grad_norm": 0.16951066255569458,
"learning_rate": 0.0001,
"loss": 0.4653,
"step": 8500
},
{
"epoch": 20.0,
"eval_explained_variance": 0.43487218022346497,
"eval_kl_divergence": 0.465139240026474,
"eval_loss": 0.4696938395500183,
"eval_mae": 0.12050192803144455,
"eval_rmse": 0.1620241105556488,
"eval_runtime": 61.0162,
"eval_samples_per_second": 38.58,
"eval_steps_per_second": 2.426,
"learning_rate": 0.0001,
"step": 8760
},
{
"epoch": 20.54794520547945,
"grad_norm": 0.167369082570076,
"learning_rate": 0.0001,
"loss": 0.4653,
"step": 9000
},
{
"epoch": 21.0,
"eval_explained_variance": 0.4389828145503998,
"eval_kl_divergence": 0.3772728741168976,
"eval_loss": 0.46922874450683594,
"eval_mae": 0.12155676633119583,
"eval_rmse": 0.16139467060565948,
"eval_runtime": 62.17,
"eval_samples_per_second": 37.864,
"eval_steps_per_second": 2.381,
"learning_rate": 0.0001,
"step": 9198
},
{
"epoch": 21.689497716894977,
"grad_norm": 0.1247042864561081,
"learning_rate": 0.0001,
"loss": 0.4659,
"step": 9500
},
{
"epoch": 22.0,
"eval_explained_variance": 0.43975934386253357,
"eval_kl_divergence": 0.4611187279224396,
"eval_loss": 0.4685634672641754,
"eval_mae": 0.1203194335103035,
"eval_rmse": 0.16088876128196716,
"eval_runtime": 62.0052,
"eval_samples_per_second": 37.965,
"eval_steps_per_second": 2.387,
"learning_rate": 0.0001,
"step": 9636
},
{
"epoch": 22.831050228310502,
"grad_norm": 0.16208066046237946,
"learning_rate": 0.0001,
"loss": 0.465,
"step": 10000
},
{
"epoch": 23.0,
"eval_explained_variance": 0.44279029965400696,
"eval_kl_divergence": 0.24986685812473297,
"eval_loss": 0.47018975019454956,
"eval_mae": 0.12256480008363724,
"eval_rmse": 0.16208301484584808,
"eval_runtime": 61.6543,
"eval_samples_per_second": 38.181,
"eval_steps_per_second": 2.4,
"learning_rate": 0.0001,
"step": 10074
},
{
"epoch": 23.972602739726028,
"grad_norm": 0.17417912185192108,
"learning_rate": 0.0001,
"loss": 0.4633,
"step": 10500
},
{
"epoch": 24.0,
"eval_explained_variance": 0.4367590844631195,
"eval_kl_divergence": 0.3702172040939331,
"eval_loss": 0.4705464243888855,
"eval_mae": 0.12131566554307938,
"eval_rmse": 0.16277877986431122,
"eval_runtime": 62.8273,
"eval_samples_per_second": 37.468,
"eval_steps_per_second": 2.356,
"learning_rate": 0.0001,
"step": 10512
},
{
"epoch": 25.0,
"eval_explained_variance": 0.4433206617832184,
"eval_kl_divergence": 0.5132729411125183,
"eval_loss": 0.4678299129009247,
"eval_mae": 0.11875440925359726,
"eval_rmse": 0.16013289988040924,
"eval_runtime": 61.7077,
"eval_samples_per_second": 38.148,
"eval_steps_per_second": 2.398,
"learning_rate": 0.0001,
"step": 10950
},
{
"epoch": 25.114155251141554,
"grad_norm": 0.13617579638957977,
"learning_rate": 0.0001,
"loss": 0.4656,
"step": 11000
},
{
"epoch": 26.0,
"eval_explained_variance": 0.4423791468143463,
"eval_kl_divergence": 0.5665323138237,
"eval_loss": 0.46802961826324463,
"eval_mae": 0.117874376475811,
"eval_rmse": 0.1604483276605606,
"eval_runtime": 61.9639,
"eval_samples_per_second": 37.99,
"eval_steps_per_second": 2.388,
"learning_rate": 0.0001,
"step": 11388
},
{
"epoch": 26.255707762557076,
"grad_norm": 0.15818916261196136,
"learning_rate": 0.0001,
"loss": 0.4629,
"step": 11500
},
{
"epoch": 27.0,
"eval_explained_variance": 0.4434410333633423,
"eval_kl_divergence": 0.42424070835113525,
"eval_loss": 0.4680938124656677,
"eval_mae": 0.1199984923005104,
"eval_rmse": 0.16038183867931366,
"eval_runtime": 62.3144,
"eval_samples_per_second": 37.776,
"eval_steps_per_second": 2.375,
"learning_rate": 0.0001,
"step": 11826
},
{
"epoch": 27.397260273972602,
"grad_norm": 0.15971983969211578,
"learning_rate": 0.0001,
"loss": 0.4636,
"step": 12000
},
{
"epoch": 28.0,
"eval_explained_variance": 0.44512465596199036,
"eval_kl_divergence": 0.2967982292175293,
"eval_loss": 0.4693257212638855,
"eval_mae": 0.12149528414011002,
"eval_rmse": 0.1616295725107193,
"eval_runtime": 66.789,
"eval_samples_per_second": 35.245,
"eval_steps_per_second": 2.216,
"learning_rate": 0.0001,
"step": 12264
},
{
"epoch": 28.538812785388128,
"grad_norm": 0.15448875725269318,
"learning_rate": 0.0001,
"loss": 0.4633,
"step": 12500
},
{
"epoch": 29.0,
"eval_explained_variance": 0.4442717730998993,
"eval_kl_divergence": 0.3924856185913086,
"eval_loss": 0.46847742795944214,
"eval_mae": 0.1196620985865593,
"eval_rmse": 0.16072382032871246,
"eval_runtime": 61.9086,
"eval_samples_per_second": 38.024,
"eval_steps_per_second": 2.391,
"learning_rate": 0.0001,
"step": 12702
},
{
"epoch": 29.680365296803654,
"grad_norm": 0.15532433986663818,
"learning_rate": 0.0001,
"loss": 0.4631,
"step": 13000
},
{
"epoch": 30.0,
"eval_explained_variance": 0.4473068416118622,
"eval_kl_divergence": 0.2495478093624115,
"eval_loss": 0.46944141387939453,
"eval_mae": 0.12209376692771912,
"eval_rmse": 0.16142255067825317,
"eval_runtime": 62.4285,
"eval_samples_per_second": 37.707,
"eval_steps_per_second": 2.371,
"learning_rate": 0.0001,
"step": 13140
},
{
"epoch": 30.82191780821918,
"grad_norm": 0.1961052566766739,
"learning_rate": 0.0001,
"loss": 0.463,
"step": 13500
},
{
"epoch": 31.0,
"eval_explained_variance": 0.4445982277393341,
"eval_kl_divergence": 0.45099732279777527,
"eval_loss": 0.4678958058357239,
"eval_mae": 0.11854251474142075,
"eval_rmse": 0.16011421382427216,
"eval_runtime": 61.3729,
"eval_samples_per_second": 38.356,
"eval_steps_per_second": 2.411,
"learning_rate": 0.0001,
"step": 13578
},
{
"epoch": 31.963470319634702,
"grad_norm": 0.3346303701400757,
"learning_rate": 1e-05,
"loss": 0.4623,
"step": 14000
},
{
"epoch": 32.0,
"eval_explained_variance": 0.4478188455104828,
"eval_kl_divergence": 0.3885524570941925,
"eval_loss": 0.46778997778892517,
"eval_mae": 0.11933697015047073,
"eval_rmse": 0.16006481647491455,
"eval_runtime": 63.8544,
"eval_samples_per_second": 36.865,
"eval_steps_per_second": 2.318,
"learning_rate": 1e-05,
"step": 14016
},
{
"epoch": 33.0,
"eval_explained_variance": 0.44756200909614563,
"eval_kl_divergence": 0.31322383880615234,
"eval_loss": 0.4686955511569977,
"eval_mae": 0.1201881393790245,
"eval_rmse": 0.16055406630039215,
"eval_runtime": 62.7334,
"eval_samples_per_second": 37.524,
"eval_steps_per_second": 2.359,
"learning_rate": 1e-05,
"step": 14454
},
{
"epoch": 33.10502283105023,
"grad_norm": 0.21087272465229034,
"learning_rate": 1e-05,
"loss": 0.4621,
"step": 14500
},
{
"epoch": 34.0,
"eval_explained_variance": 0.4478868544101715,
"eval_kl_divergence": 0.3957745432853699,
"eval_loss": 0.46784329414367676,
"eval_mae": 0.11951460689306259,
"eval_rmse": 0.1600986272096634,
"eval_runtime": 60.6174,
"eval_samples_per_second": 38.834,
"eval_steps_per_second": 2.442,
"learning_rate": 1e-05,
"step": 14892
},
{
"epoch": 34.24657534246575,
"grad_norm": 0.1875353455543518,
"learning_rate": 1e-05,
"loss": 0.4607,
"step": 15000
},
{
"epoch": 35.0,
"eval_explained_variance": 0.44849491119384766,
"eval_kl_divergence": 0.45786312222480774,
"eval_loss": 0.4671097695827484,
"eval_mae": 0.11800643056631088,
"eval_rmse": 0.15947793424129486,
"eval_runtime": 61.7609,
"eval_samples_per_second": 38.115,
"eval_steps_per_second": 2.396,
"learning_rate": 1e-05,
"step": 15330
},
{
"epoch": 35.38812785388128,
"grad_norm": 0.16752338409423828,
"learning_rate": 1e-05,
"loss": 0.4619,
"step": 15500
},
{
"epoch": 36.0,
"eval_explained_variance": 0.44863569736480713,
"eval_kl_divergence": 0.43913933634757996,
"eval_loss": 0.46735426783561707,
"eval_mae": 0.11842861026525497,
"eval_rmse": 0.15950414538383484,
"eval_runtime": 64.5571,
"eval_samples_per_second": 36.464,
"eval_steps_per_second": 2.293,
"learning_rate": 1e-05,
"step": 15768
},
{
"epoch": 36.529680365296805,
"grad_norm": 0.15660376846790314,
"learning_rate": 1e-05,
"loss": 0.4612,
"step": 16000
},
{
"epoch": 37.0,
"eval_explained_variance": 0.44943228363990784,
"eval_kl_divergence": 0.36332887411117554,
"eval_loss": 0.468018501996994,
"eval_mae": 0.11912700533866882,
"eval_rmse": 0.16002707183361053,
"eval_runtime": 63.2971,
"eval_samples_per_second": 37.19,
"eval_steps_per_second": 2.338,
"learning_rate": 1e-05,
"step": 16206
},
{
"epoch": 37.67123287671233,
"grad_norm": 0.15865331888198853,
"learning_rate": 1e-05,
"loss": 0.4625,
"step": 16500
},
{
"epoch": 38.0,
"eval_explained_variance": 0.45025742053985596,
"eval_kl_divergence": 0.43029093742370605,
"eval_loss": 0.46701580286026,
"eval_mae": 0.1186341941356659,
"eval_rmse": 0.15923398733139038,
"eval_runtime": 63.0229,
"eval_samples_per_second": 37.351,
"eval_steps_per_second": 2.348,
"learning_rate": 1e-05,
"step": 16644
},
{
"epoch": 38.81278538812786,
"grad_norm": 0.2913804352283478,
"learning_rate": 1e-05,
"loss": 0.4608,
"step": 17000
},
{
"epoch": 39.0,
"eval_explained_variance": 0.448851078748703,
"eval_kl_divergence": 0.4562166929244995,
"eval_loss": 0.4673251509666443,
"eval_mae": 0.11870113760232925,
"eval_rmse": 0.1596096307039261,
"eval_runtime": 63.132,
"eval_samples_per_second": 37.287,
"eval_steps_per_second": 2.344,
"learning_rate": 1e-05,
"step": 17082
},
{
"epoch": 39.954337899543376,
"grad_norm": 0.1813182234764099,
"learning_rate": 1e-05,
"loss": 0.4614,
"step": 17500
},
{
"epoch": 40.0,
"eval_explained_variance": 0.449774831533432,
"eval_kl_divergence": 0.40653547644615173,
"eval_loss": 0.4673212468624115,
"eval_mae": 0.1188703179359436,
"eval_rmse": 0.15939703583717346,
"eval_runtime": 65.2215,
"eval_samples_per_second": 36.092,
"eval_steps_per_second": 2.269,
"learning_rate": 1e-05,
"step": 17520
},
{
"epoch": 41.0,
"eval_explained_variance": 0.4507579803466797,
"eval_kl_divergence": 0.3335873782634735,
"eval_loss": 0.4677547216415405,
"eval_mae": 0.12059084326028824,
"eval_rmse": 0.159872904419899,
"eval_runtime": 65.9882,
"eval_samples_per_second": 35.673,
"eval_steps_per_second": 2.243,
"learning_rate": 1e-05,
"step": 17958
},
{
"epoch": 41.0958904109589,
"grad_norm": 0.1584874391555786,
"learning_rate": 1e-05,
"loss": 0.4608,
"step": 18000
},
{
"epoch": 42.0,
"eval_explained_variance": 0.4486294686794281,
"eval_kl_divergence": 0.5311685800552368,
"eval_loss": 0.4671882390975952,
"eval_mae": 0.1177595853805542,
"eval_rmse": 0.15967120230197906,
"eval_runtime": 65.4501,
"eval_samples_per_second": 35.966,
"eval_steps_per_second": 2.261,
"learning_rate": 1e-05,
"step": 18396
},
{
"epoch": 42.23744292237443,
"grad_norm": 0.17140232026576996,
"learning_rate": 1e-05,
"loss": 0.4615,
"step": 18500
},
{
"epoch": 43.0,
"eval_explained_variance": 0.45157137513160706,
"eval_kl_divergence": 0.3923657536506653,
"eval_loss": 0.46716412901878357,
"eval_mae": 0.1185157299041748,
"eval_rmse": 0.1592295914888382,
"eval_runtime": 64.3671,
"eval_samples_per_second": 36.571,
"eval_steps_per_second": 2.299,
"learning_rate": 1e-05,
"step": 18834
},
{
"epoch": 43.37899543378995,
"grad_norm": 0.12803754210472107,
"learning_rate": 1e-05,
"loss": 0.4601,
"step": 19000
},
{
"epoch": 44.0,
"eval_explained_variance": 0.44912728667259216,
"eval_kl_divergence": 0.4258858561515808,
"eval_loss": 0.4678168296813965,
"eval_mae": 0.11944716423749924,
"eval_rmse": 0.16020986437797546,
"eval_runtime": 65.6519,
"eval_samples_per_second": 35.856,
"eval_steps_per_second": 2.254,
"learning_rate": 1e-05,
"step": 19272
},
{
"epoch": 44.52054794520548,
"grad_norm": 0.12536858022212982,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4602,
"step": 19500
},
{
"epoch": 45.0,
"eval_explained_variance": 0.4489940404891968,
"eval_kl_divergence": 0.5214298367500305,
"eval_loss": 0.46699702739715576,
"eval_mae": 0.11719372868537903,
"eval_rmse": 0.15936775505542755,
"eval_runtime": 64.8181,
"eval_samples_per_second": 36.317,
"eval_steps_per_second": 2.283,
"learning_rate": 1.0000000000000002e-06,
"step": 19710
},
{
"epoch": 45.662100456621005,
"grad_norm": 0.12503549456596375,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4616,
"step": 20000
},
{
"epoch": 46.0,
"eval_explained_variance": 0.45176592469215393,
"eval_kl_divergence": 0.4174787700176239,
"eval_loss": 0.46712958812713623,
"eval_mae": 0.11880326271057129,
"eval_rmse": 0.1593877524137497,
"eval_runtime": 64.134,
"eval_samples_per_second": 36.704,
"eval_steps_per_second": 2.308,
"learning_rate": 1.0000000000000002e-06,
"step": 20148
},
{
"epoch": 46.80365296803653,
"grad_norm": 0.1746779829263687,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4602,
"step": 20500
},
{
"epoch": 47.0,
"eval_explained_variance": 0.4524901807308197,
"eval_kl_divergence": 0.4446321427822113,
"eval_loss": 0.4666382074356079,
"eval_mae": 0.11884639412164688,
"eval_rmse": 0.15886224806308746,
"eval_runtime": 68.911,
"eval_samples_per_second": 34.16,
"eval_steps_per_second": 2.148,
"learning_rate": 1.0000000000000002e-06,
"step": 20586
},
{
"epoch": 47.945205479452056,
"grad_norm": 0.18253998458385468,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4604,
"step": 21000
},
{
"epoch": 48.0,
"eval_explained_variance": 0.44860827922821045,
"eval_kl_divergence": 0.5755118727684021,
"eval_loss": 0.46714723110198975,
"eval_mae": 0.11802936345338821,
"eval_rmse": 0.15972274541854858,
"eval_runtime": 68.5695,
"eval_samples_per_second": 34.33,
"eval_steps_per_second": 2.158,
"learning_rate": 1.0000000000000002e-06,
"step": 21024
},
{
"epoch": 49.0,
"eval_explained_variance": 0.4494647979736328,
"eval_kl_divergence": 0.4303589463233948,
"eval_loss": 0.46758702397346497,
"eval_mae": 0.11922705173492432,
"eval_rmse": 0.15995512902736664,
"eval_runtime": 68.4997,
"eval_samples_per_second": 34.365,
"eval_steps_per_second": 2.161,
"learning_rate": 1.0000000000000002e-06,
"step": 21462
},
{
"epoch": 49.08675799086758,
"grad_norm": 0.1836538463830948,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4606,
"step": 21500
},
{
"epoch": 50.0,
"eval_explained_variance": 0.4534037411212921,
"eval_kl_divergence": 0.33374354243278503,
"eval_loss": 0.46752068400382996,
"eval_mae": 0.12040751427412033,
"eval_rmse": 0.15945331752300262,
"eval_runtime": 67.7842,
"eval_samples_per_second": 34.728,
"eval_steps_per_second": 2.183,
"learning_rate": 1.0000000000000002e-06,
"step": 21900
},
{
"epoch": 50.22831050228311,
"grad_norm": 0.18452928960323334,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4598,
"step": 22000
},
{
"epoch": 51.0,
"eval_explained_variance": 0.4523892402648926,
"eval_kl_divergence": 0.395465224981308,
"eval_loss": 0.46691644191741943,
"eval_mae": 0.11809410899877548,
"eval_rmse": 0.1590944528579712,
"eval_runtime": 68.2629,
"eval_samples_per_second": 34.484,
"eval_steps_per_second": 2.168,
"learning_rate": 1.0000000000000002e-06,
"step": 22338
},
{
"epoch": 51.36986301369863,
"grad_norm": 0.1816985160112381,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4602,
"step": 22500
},
{
"epoch": 52.0,
"eval_explained_variance": 0.45300889015197754,
"eval_kl_divergence": 0.4761090576648712,
"eval_loss": 0.466439425945282,
"eval_mae": 0.1174706444144249,
"eval_rmse": 0.15875311195850372,
"eval_runtime": 68.2396,
"eval_samples_per_second": 34.496,
"eval_steps_per_second": 2.169,
"learning_rate": 1.0000000000000002e-06,
"step": 22776
},
{
"epoch": 52.51141552511415,
"grad_norm": 0.17806819081306458,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.462,
"step": 23000
},
{
"epoch": 53.0,
"eval_explained_variance": 0.45259252190589905,
"eval_kl_divergence": 0.43274176120758057,
"eval_loss": 0.4667709469795227,
"eval_mae": 0.11889918893575668,
"eval_rmse": 0.15901200473308563,
"eval_runtime": 66.8799,
"eval_samples_per_second": 35.197,
"eval_steps_per_second": 2.213,
"learning_rate": 1.0000000000000002e-06,
"step": 23214
},
{
"epoch": 53.65296803652968,
"grad_norm": 0.18054644763469696,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4604,
"step": 23500
},
{
"epoch": 54.0,
"eval_explained_variance": 0.4532507658004761,
"eval_kl_divergence": 0.3724806606769562,
"eval_loss": 0.46701404452323914,
"eval_mae": 0.11868719011545181,
"eval_rmse": 0.15923155844211578,
"eval_runtime": 73.556,
"eval_samples_per_second": 32.003,
"eval_steps_per_second": 2.012,
"learning_rate": 1.0000000000000002e-06,
"step": 23652
},
{
"epoch": 54.794520547945204,
"grad_norm": 0.26471829414367676,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.461,
"step": 24000
},
{
"epoch": 55.0,
"eval_explained_variance": 0.45088374614715576,
"eval_kl_divergence": 0.38409897685050964,
"eval_loss": 0.467383474111557,
"eval_mae": 0.11990005522966385,
"eval_rmse": 0.1595049947500229,
"eval_runtime": 70.451,
"eval_samples_per_second": 33.413,
"eval_steps_per_second": 2.101,
"learning_rate": 1.0000000000000002e-06,
"step": 24090
},
{
"epoch": 55.93607305936073,
"grad_norm": 0.2783886194229126,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4599,
"step": 24500
},
{
"epoch": 56.0,
"eval_explained_variance": 0.45115411281585693,
"eval_kl_divergence": 0.3821828067302704,
"eval_loss": 0.46739572286605835,
"eval_mae": 0.11897724121809006,
"eval_rmse": 0.15964223444461823,
"eval_runtime": 69.6578,
"eval_samples_per_second": 33.794,
"eval_steps_per_second": 2.125,
"learning_rate": 1.0000000000000002e-06,
"step": 24528
},
{
"epoch": 57.0,
"eval_explained_variance": 0.4505263864994049,
"eval_kl_divergence": 0.4674541652202606,
"eval_loss": 0.46702033281326294,
"eval_mae": 0.1185864806175232,
"eval_rmse": 0.15932416915893555,
"eval_runtime": 67.4689,
"eval_samples_per_second": 34.89,
"eval_steps_per_second": 2.194,
"learning_rate": 1.0000000000000002e-06,
"step": 24966
},
{
"epoch": 57.077625570776256,
"grad_norm": 0.16562320291996002,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4594,
"step": 25000
},
{
"epoch": 58.0,
"eval_explained_variance": 0.4521506726741791,
"eval_kl_divergence": 0.37376847863197327,
"eval_loss": 0.46735846996307373,
"eval_mae": 0.11891353130340576,
"eval_rmse": 0.15956538915634155,
"eval_runtime": 68.6492,
"eval_samples_per_second": 34.29,
"eval_steps_per_second": 2.156,
"learning_rate": 1.0000000000000002e-06,
"step": 25404
},
{
"epoch": 58.21917808219178,
"grad_norm": 0.21171259880065918,
"learning_rate": 1.0000000000000002e-07,
"loss": 0.4613,
"step": 25500
},
{
"epoch": 59.0,
"eval_explained_variance": 0.45357391238212585,
"eval_kl_divergence": 0.4204346239566803,
"eval_loss": 0.46666717529296875,
"eval_mae": 0.11845538765192032,
"eval_rmse": 0.1589372605085373,
"eval_runtime": 69.2012,
"eval_samples_per_second": 34.017,
"eval_steps_per_second": 2.139,
"learning_rate": 1.0000000000000002e-07,
"step": 25842
},
{
"epoch": 59.36073059360731,
"grad_norm": 0.1960112601518631,
"learning_rate": 1.0000000000000002e-07,
"loss": 0.4607,
"step": 26000
},
{
"epoch": 60.0,
"eval_explained_variance": 0.4513193368911743,
"eval_kl_divergence": 0.45320600271224976,
"eval_loss": 0.46685320138931274,
"eval_mae": 0.11779770255088806,
"eval_rmse": 0.15917657315731049,
"eval_runtime": 71.4331,
"eval_samples_per_second": 32.954,
"eval_steps_per_second": 2.072,
"learning_rate": 1.0000000000000002e-07,
"step": 26280
},
{
"epoch": 60.50228310502283,
"grad_norm": 0.2178792506456375,
"learning_rate": 1.0000000000000002e-07,
"loss": 0.4613,
"step": 26500
},
{
"epoch": 61.0,
"eval_explained_variance": 0.45110437273979187,
"eval_kl_divergence": 0.40322577953338623,
"eval_loss": 0.46734780073165894,
"eval_mae": 0.11893540620803833,
"eval_rmse": 0.1595635712146759,
"eval_runtime": 69.3534,
"eval_samples_per_second": 33.942,
"eval_steps_per_second": 2.134,
"learning_rate": 1.0000000000000002e-07,
"step": 26718
},
{
"epoch": 61.64383561643836,
"grad_norm": 0.16740958392620087,
"learning_rate": 1.0000000000000002e-07,
"loss": 0.4598,
"step": 27000
},
{
"epoch": 62.0,
"eval_explained_variance": 0.4526772201061249,
"eval_kl_divergence": 0.3406714200973511,
"eval_loss": 0.4673011302947998,
"eval_mae": 0.11888447403907776,
"eval_rmse": 0.1594574898481369,
"eval_runtime": 70.4024,
"eval_samples_per_second": 33.436,
"eval_steps_per_second": 2.102,
"learning_rate": 1.0000000000000002e-07,
"step": 27156
},
{
"epoch": 62.0,
"learning_rate": 1.0000000000000002e-07,
"step": 27156,
"total_flos": 6.42634409963284e+19,
"train_loss": 0.466335079458891,
"train_runtime": 17194.6751,
"train_samples_per_second": 61.092,
"train_steps_per_second": 3.821
}
],
"logging_steps": 500,
"max_steps": 65700,
"num_input_tokens_seen": 0,
"num_train_epochs": 150,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.42634409963284e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}