{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 122720,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.04074315514993481, "grad_norm": 1.5828578472137451, "learning_rate": 0.0002987777053455019, "loss": 1.0838, "step": 500},
    {"epoch": 0.08148631029986962, "grad_norm": 2.0423190593719482, "learning_rate": 0.0002975554106910039, "loss": 0.8724, "step": 1000},
    {"epoch": 0.12222946544980444, "grad_norm": 2.746290683746338, "learning_rate": 0.00029633311603650584, "loss": 0.7804, "step": 1500},
    {"epoch": 0.16297262059973924, "grad_norm": 2.9885292053222656, "learning_rate": 0.0002951108213820078, "loss": 0.7367, "step": 2000},
    {"epoch": 0.20371577574967406, "grad_norm": 2.7808330059051514, "learning_rate": 0.00029388852672750977, "loss": 0.7016, "step": 2500},
    {"epoch": 0.24445893089960888, "grad_norm": 2.4769978523254395, "learning_rate": 0.0002926662320730117, "loss": 0.6759, "step": 3000},
    {"epoch": 0.28520208604954367, "grad_norm": 2.141312599182129, "learning_rate": 0.00029144393741851364, "loss": 0.6623, "step": 3500},
    {"epoch": 0.3259452411994785, "grad_norm": 2.169989824295044, "learning_rate": 0.0002902216427640156, "loss": 0.6352, "step": 4000},
    {"epoch": 0.3666883963494133, "grad_norm": 3.238466262817383, "learning_rate": 0.0002889993481095176, "loss": 0.6382, "step": 4500},
    {"epoch": 0.4074315514993481, "grad_norm": 2.6361501216888428, "learning_rate": 0.00028777705345501956, "loss": 0.6168, "step": 5000},
    {"epoch": 0.44817470664928294, "grad_norm": 2.7179198265075684, "learning_rate": 0.0002865547588005215, "loss": 0.609, "step": 5500},
    {"epoch": 0.48891786179921776, "grad_norm": 1.717613935470581, "learning_rate": 0.00028533246414602344, "loss": 0.6023, "step": 6000},
    {"epoch": 0.5296610169491526, "grad_norm": 1.5747934579849243, "learning_rate": 0.00028411016949152543, "loss": 0.5995, "step": 6500},
    {"epoch": 0.5704041720990873, "grad_norm": 2.5513648986816406, "learning_rate": 0.00028288787483702737, "loss": 0.5817, "step": 7000},
    {"epoch": 0.6111473272490222, "grad_norm": 2.6356778144836426, "learning_rate": 0.0002816655801825293, "loss": 0.5816, "step": 7500},
    {"epoch": 0.651890482398957, "grad_norm": 1.4905149936676025, "learning_rate": 0.00028044328552803124, "loss": 0.587, "step": 8000},
    {"epoch": 0.6926336375488917, "grad_norm": 2.126027822494507, "learning_rate": 0.00027922099087353323, "loss": 0.5763, "step": 8500},
    {"epoch": 0.7333767926988266, "grad_norm": 1.642021656036377, "learning_rate": 0.00027799869621903517, "loss": 0.5788, "step": 9000},
    {"epoch": 0.7741199478487614, "grad_norm": 2.919405460357666, "learning_rate": 0.0002767764015645371, "loss": 0.5756, "step": 9500},
    {"epoch": 0.8148631029986962, "grad_norm": 2.7401773929595947, "learning_rate": 0.0002755541069100391, "loss": 0.561, "step": 10000},
    {"epoch": 0.855606258148631, "grad_norm": 3.268413543701172, "learning_rate": 0.00027433181225554104, "loss": 0.5627, "step": 10500},
    {"epoch": 0.8963494132985659, "grad_norm": 1.6420167684555054, "learning_rate": 0.000273109517601043, "loss": 0.5591, "step": 11000},
    {"epoch": 0.9370925684485006, "grad_norm": 3.6072638034820557, "learning_rate": 0.00027188722294654497, "loss": 0.5593, "step": 11500},
    {"epoch": 0.9778357235984355, "grad_norm": 1.9567480087280273, "learning_rate": 0.0002706649282920469, "loss": 0.5593, "step": 12000},
    {"epoch": 1.0, "eval_accuracy": 0.7959839105606079, "eval_loss": 0.5131802558898926, "eval_runtime": 3.8597, "eval_samples_per_second": 645.123, "eval_steps_per_second": 80.835, "step": 12272},
    {"epoch": 1.0185788787483703, "grad_norm": 1.4843103885650635, "learning_rate": 0.00026944263363754884, "loss": 0.5502, "step": 12500},
    {"epoch": 1.0593220338983051, "grad_norm": 2.488762855529785, "learning_rate": 0.00026822033898305083, "loss": 0.5391, "step": 13000},
    {"epoch": 1.1000651890482398, "grad_norm": 2.1324970722198486, "learning_rate": 0.00026699804432855277, "loss": 0.543, "step": 13500},
    {"epoch": 1.1408083441981747, "grad_norm": 1.2764211893081665, "learning_rate": 0.00026577574967405476, "loss": 0.5397, "step": 14000},
    {"epoch": 1.1815514993481095, "grad_norm": 1.6294676065444946, "learning_rate": 0.0002645534550195567, "loss": 0.5487, "step": 14500},
    {"epoch": 1.2222946544980444, "grad_norm": 1.9771702289581299, "learning_rate": 0.00026333116036505864, "loss": 0.5365, "step": 15000},
    {"epoch": 1.263037809647979, "grad_norm": 2.3716676235198975, "learning_rate": 0.00026210886571056063, "loss": 0.5367, "step": 15500},
    {"epoch": 1.303780964797914, "grad_norm": 3.3372445106506348, "learning_rate": 0.00026088657105606257, "loss": 0.5339, "step": 16000},
    {"epoch": 1.3445241199478488, "grad_norm": 1.6658248901367188, "learning_rate": 0.0002596642764015645, "loss": 0.5306, "step": 16500},
    {"epoch": 1.3852672750977835, "grad_norm": 3.4499964714050293, "learning_rate": 0.0002584419817470665, "loss": 0.5337, "step": 17000},
    {"epoch": 1.4260104302477183, "grad_norm": 1.6458457708358765, "learning_rate": 0.00025721968709256843, "loss": 0.5288, "step": 17500},
    {"epoch": 1.4667535853976532, "grad_norm": 1.9066393375396729, "learning_rate": 0.00025599739243807037, "loss": 0.5255, "step": 18000},
    {"epoch": 1.5074967405475879, "grad_norm": 1.735753059387207, "learning_rate": 0.0002547750977835723, "loss": 0.5282, "step": 18500},
    {"epoch": 1.548239895697523, "grad_norm": 1.7421000003814697, "learning_rate": 0.0002535528031290743, "loss": 0.5194, "step": 19000},
    {"epoch": 1.5889830508474576, "grad_norm": 2.0979628562927246, "learning_rate": 0.00025233050847457624, "loss": 0.5334, "step": 19500},
    {"epoch": 1.6297262059973925, "grad_norm": 1.8151721954345703, "learning_rate": 0.0002511082138200782, "loss": 0.5283, "step": 20000},
    {"epoch": 1.6704693611473274, "grad_norm": 1.5404752492904663, "learning_rate": 0.00024988591916558017, "loss": 0.5232, "step": 20500},
    {"epoch": 1.711212516297262, "grad_norm": 3.021263837814331, "learning_rate": 0.0002486636245110821, "loss": 0.5184, "step": 21000},
    {"epoch": 1.7519556714471969, "grad_norm": 3.665238380432129, "learning_rate": 0.00024744132985658404, "loss": 0.5166, "step": 21500},
    {"epoch": 1.7926988265971318, "grad_norm": 1.4015684127807617, "learning_rate": 0.00024621903520208603, "loss": 0.5158, "step": 22000},
    {"epoch": 1.8334419817470664, "grad_norm": 3.794595241546631, "learning_rate": 0.000244996740547588, "loss": 0.5267, "step": 22500},
    {"epoch": 1.8741851368970013, "grad_norm": 1.9950809478759766, "learning_rate": 0.00024377444589308996, "loss": 0.5119, "step": 23000},
    {"epoch": 1.9149282920469362, "grad_norm": 1.5925782918930054, "learning_rate": 0.0002425521512385919, "loss": 0.5203, "step": 23500},
    {"epoch": 1.9556714471968708, "grad_norm": 2.9431841373443604, "learning_rate": 0.00024132985658409386, "loss": 0.512, "step": 24000},
    {"epoch": 1.996414602346806, "grad_norm": 1.6975939273834229, "learning_rate": 0.00024010756192959583, "loss": 0.5079, "step": 24500},
    {"epoch": 2.0, "eval_accuracy": 0.811646580696106, "eval_loss": 0.4724164307117462, "eval_runtime": 3.8494, "eval_samples_per_second": 646.846, "eval_steps_per_second": 81.051, "step": 24544},
    {"epoch": 2.0371577574967406, "grad_norm": 2.4534783363342285, "learning_rate": 0.00023888526727509777, "loss": 0.5027, "step": 25000},
    {"epoch": 2.077900912646675, "grad_norm": 2.0405538082122803, "learning_rate": 0.00023766297262059973, "loss": 0.5008, "step": 25500},
    {"epoch": 2.1186440677966103, "grad_norm": 1.833534836769104, "learning_rate": 0.00023644067796610167, "loss": 0.5043, "step": 26000},
    {"epoch": 2.159387222946545, "grad_norm": 1.8968788385391235, "learning_rate": 0.00023521838331160363, "loss": 0.5018, "step": 26500},
    {"epoch": 2.2001303780964796, "grad_norm": 1.6663825511932373, "learning_rate": 0.0002339960886571056, "loss": 0.501, "step": 27000},
    {"epoch": 2.2408735332464147, "grad_norm": 2.6435630321502686, "learning_rate": 0.00023277379400260753, "loss": 0.4976, "step": 27500},
    {"epoch": 2.2816166883963493, "grad_norm": 1.5958212614059448, "learning_rate": 0.0002315514993481095, "loss": 0.5056, "step": 28000},
    {"epoch": 2.322359843546284, "grad_norm": 1.644378423690796, "learning_rate": 0.00023032920469361144, "loss": 0.4856, "step": 28500},
    {"epoch": 2.363102998696219, "grad_norm": 1.9557281732559204, "learning_rate": 0.0002291069100391134, "loss": 0.4943, "step": 29000},
    {"epoch": 2.4038461538461537, "grad_norm": 3.529120922088623, "learning_rate": 0.00022788461538461537, "loss": 0.4956, "step": 29500},
    {"epoch": 2.444589308996089, "grad_norm": 2.105900287628174, "learning_rate": 0.0002266623207301173, "loss": 0.5032, "step": 30000},
    {"epoch": 2.4853324641460235, "grad_norm": 2.1510558128356934, "learning_rate": 0.00022544002607561927, "loss": 0.5014, "step": 30500},
    {"epoch": 2.526075619295958, "grad_norm": 2.0106310844421387, "learning_rate": 0.0002242177314211212, "loss": 0.5047, "step": 31000},
    {"epoch": 2.5668187744458932, "grad_norm": 3.261289358139038, "learning_rate": 0.0002229954367666232, "loss": 0.5044, "step": 31500},
    {"epoch": 2.607561929595828, "grad_norm": 1.9700287580490112, "learning_rate": 0.00022177314211212516, "loss": 0.4921, "step": 32000},
    {"epoch": 2.648305084745763, "grad_norm": 1.7623224258422852, "learning_rate": 0.0002205508474576271, "loss": 0.493, "step": 32500},
    {"epoch": 2.6890482398956976, "grad_norm": 2.0307579040527344, "learning_rate": 0.00021932855280312906, "loss": 0.4944, "step": 33000},
    {"epoch": 2.7297913950456323, "grad_norm": 1.6054081916809082, "learning_rate": 0.00021810625814863103, "loss": 0.495, "step": 33500},
    {"epoch": 2.770534550195567, "grad_norm": 1.2301048040390015, "learning_rate": 0.00021688396349413296, "loss": 0.5006, "step": 34000},
    {"epoch": 2.811277705345502, "grad_norm": 2.173492670059204, "learning_rate": 0.00021566166883963493, "loss": 0.4952, "step": 34500},
    {"epoch": 2.8520208604954367, "grad_norm": 1.8370041847229004, "learning_rate": 0.0002144393741851369, "loss": 0.4774, "step": 35000},
    {"epoch": 2.8927640156453718, "grad_norm": 2.2364652156829834, "learning_rate": 0.00021321707953063883, "loss": 0.4848, "step": 35500},
    {"epoch": 2.9335071707953064, "grad_norm": 1.6861385107040405, "learning_rate": 0.0002119947848761408, "loss": 0.4982, "step": 36000},
    {"epoch": 2.974250325945241, "grad_norm": 1.938073992729187, "learning_rate": 0.00021077249022164273, "loss": 0.4815, "step": 36500},
    {"epoch": 3.0, "eval_accuracy": 0.7899598479270935, "eval_loss": 0.5340293049812317, "eval_runtime": 3.8273, "eval_samples_per_second": 650.596, "eval_steps_per_second": 81.521, "step": 36816},
    {"epoch": 3.014993481095176, "grad_norm": 1.7496318817138672, "learning_rate": 0.0002095501955671447, "loss": 0.4926, "step": 37000},
    {"epoch": 3.055736636245111, "grad_norm": 1.6960176229476929, "learning_rate": 0.00020832790091264666, "loss": 0.4807, "step": 37500},
    {"epoch": 3.0964797913950455, "grad_norm": 1.9474419355392456, "learning_rate": 0.0002071056062581486, "loss": 0.4753, "step": 38000},
    {"epoch": 3.1372229465449806, "grad_norm": 2.938944101333618, "learning_rate": 0.00020588331160365056, "loss": 0.4869, "step": 38500},
    {"epoch": 3.1779661016949152, "grad_norm": 1.9578561782836914, "learning_rate": 0.0002046610169491525, "loss": 0.473, "step": 39000},
    {"epoch": 3.21870925684485, "grad_norm": 2.039128541946411, "learning_rate": 0.00020343872229465447, "loss": 0.4744, "step": 39500},
    {"epoch": 3.259452411994785, "grad_norm": 3.681124687194824, "learning_rate": 0.00020221642764015643, "loss": 0.4799, "step": 40000},
    {"epoch": 3.3001955671447196, "grad_norm": 1.5929746627807617, "learning_rate": 0.0002009941329856584, "loss": 0.4787, "step": 40500},
    {"epoch": 3.3409387222946547, "grad_norm": 1.7738901376724243, "learning_rate": 0.00019977183833116036, "loss": 0.4721, "step": 41000},
    {"epoch": 3.3816818774445894, "grad_norm": 1.670168161392212, "learning_rate": 0.00019854954367666232, "loss": 0.4842, "step": 41500},
    {"epoch": 3.422425032594524, "grad_norm": 2.07709002494812, "learning_rate": 0.00019732724902216426, "loss": 0.4676, "step": 42000},
    {"epoch": 3.463168187744459, "grad_norm": 1.76602041721344, "learning_rate": 0.00019610495436766623, "loss": 0.478, "step": 42500},
    {"epoch": 3.5039113428943938, "grad_norm": 1.7546826601028442, "learning_rate": 0.00019488265971316816, "loss": 0.481, "step": 43000},
    {"epoch": 3.5446544980443284, "grad_norm": 1.9292327165603638, "learning_rate": 0.00019366036505867013, "loss": 0.4763, "step": 43500},
    {"epoch": 3.5853976531942635, "grad_norm": 2.211700916290283, "learning_rate": 0.0001924380704041721, "loss": 0.4736, "step": 44000},
    {"epoch": 3.626140808344198, "grad_norm": 3.4379701614379883, "learning_rate": 0.00019121577574967403, "loss": 0.4691, "step": 44500},
    {"epoch": 3.666883963494133, "grad_norm": 3.3809897899627686, "learning_rate": 0.000189993481095176, "loss": 0.4865, "step": 45000},
    {"epoch": 3.707627118644068, "grad_norm": 2.958820343017578, "learning_rate": 0.00018877118644067796, "loss": 0.4773, "step": 45500},
    {"epoch": 3.7483702737940026, "grad_norm": 2.081221342086792, "learning_rate": 0.0001875488917861799, "loss": 0.4643, "step": 46000},
    {"epoch": 3.7891134289439377, "grad_norm": 1.3643114566802979, "learning_rate": 0.00018632659713168186, "loss": 0.4615, "step": 46500},
    {"epoch": 3.8298565840938723, "grad_norm": 2.669874668121338, "learning_rate": 0.0001851043024771838, "loss": 0.4786, "step": 47000},
    {"epoch": 3.870599739243807, "grad_norm": 1.4289213418960571, "learning_rate": 0.00018388200782268576, "loss": 0.4796, "step": 47500},
    {"epoch": 3.9113428943937416, "grad_norm": 2.1719110012054443, "learning_rate": 0.00018265971316818773, "loss": 0.4798, "step": 48000},
    {"epoch": 3.9520860495436767, "grad_norm": 1.5655969381332397, "learning_rate": 0.00018143741851368966, "loss": 0.4587, "step": 48500},
    {"epoch": 3.9928292046936114, "grad_norm": 2.242460250854492, "learning_rate": 0.00018021512385919163, "loss": 0.4757, "step": 49000},
    {"epoch": 4.0, "eval_accuracy": 0.823293149471283, "eval_loss": 0.4503098726272583, "eval_runtime": 3.7594, "eval_samples_per_second": 662.343, "eval_steps_per_second": 82.992, "step": 49088},
    {"epoch": 4.0335723598435465, "grad_norm": 1.2779546976089478, "learning_rate": 0.00017899282920469362, "loss": 0.4646, "step": 49500},
    {"epoch": 4.074315514993481, "grad_norm": 2.8832106590270996, "learning_rate": 0.00017777053455019556, "loss": 0.4535, "step": 50000},
    {"epoch": 4.115058670143416, "grad_norm": 2.621349811553955, "learning_rate": 0.00017654823989569752, "loss": 0.466, "step": 50500},
    {"epoch": 4.15580182529335, "grad_norm": 2.7702085971832275, "learning_rate": 0.00017532594524119946, "loss": 0.4535, "step": 51000},
    {"epoch": 4.196544980443286, "grad_norm": 2.0533089637756348, "learning_rate": 0.00017410365058670142, "loss": 0.4539, "step": 51500},
    {"epoch": 4.237288135593221, "grad_norm": 2.3479714393615723, "learning_rate": 0.0001728813559322034, "loss": 0.4536, "step": 52000},
    {"epoch": 4.278031290743155, "grad_norm": 2.5870485305786133, "learning_rate": 0.00017165906127770533, "loss": 0.4586, "step": 52500},
    {"epoch": 4.31877444589309, "grad_norm": 2.1694679260253906, "learning_rate": 0.0001704367666232073, "loss": 0.468, "step": 53000},
    {"epoch": 4.3595176010430245, "grad_norm": 2.3384058475494385, "learning_rate": 0.00016921447196870926, "loss": 0.4629, "step": 53500},
    {"epoch": 4.400260756192959, "grad_norm": 1.984429121017456, "learning_rate": 0.0001679921773142112, "loss": 0.4609, "step": 54000},
    {"epoch": 4.441003911342895, "grad_norm": 2.1723473072052, "learning_rate": 0.00016676988265971316, "loss": 0.4569, "step": 54500},
    {"epoch": 4.481747066492829, "grad_norm": 1.6641265153884888, "learning_rate": 0.0001655475880052151, "loss": 0.4604, "step": 55000},
    {"epoch": 4.522490221642764, "grad_norm": 2.4631197452545166, "learning_rate": 0.00016432529335071706, "loss": 0.4534, "step": 55500},
    {"epoch": 4.563233376792699, "grad_norm": 1.8890092372894287, "learning_rate": 0.00016310299869621902, "loss": 0.4594, "step": 56000},
    {"epoch": 4.603976531942633, "grad_norm": 2.240000009536743, "learning_rate": 0.00016188070404172096, "loss": 0.459, "step": 56500},
    {"epoch": 4.644719687092568, "grad_norm": 2.3564798831939697, "learning_rate": 0.00016065840938722293, "loss": 0.4644, "step": 57000},
    {"epoch": 4.6854628422425035, "grad_norm": 2.3006091117858887, "learning_rate": 0.00015943611473272486, "loss": 0.4671, "step": 57500},
    {"epoch": 4.726205997392438, "grad_norm": 2.5902099609375, "learning_rate": 0.00015821382007822685, "loss": 0.4574, "step": 58000},
    {"epoch": 4.766949152542373, "grad_norm": 2.838531970977783, "learning_rate": 0.00015699152542372882, "loss": 0.4579, "step": 58500},
    {"epoch": 4.8076923076923075, "grad_norm": 2.0663444995880127, "learning_rate": 0.00015576923076923076, "loss": 0.475, "step": 59000},
    {"epoch": 4.848435462842242, "grad_norm": 2.3260273933410645, "learning_rate": 0.00015454693611473272, "loss": 0.4564, "step": 59500},
    {"epoch": 4.889178617992178, "grad_norm": 1.6334234476089478, "learning_rate": 0.00015332464146023469, "loss": 0.4582, "step": 60000},
    {"epoch": 4.929921773142112, "grad_norm": 3.73071026802063, "learning_rate": 0.00015210234680573662, "loss": 0.462, "step": 60500},
    {"epoch": 4.970664928292047, "grad_norm": 2.0201449394226074, "learning_rate": 0.0001508800521512386, "loss": 0.4575, "step": 61000},
    {"epoch": 5.0, "eval_accuracy": 0.8321285247802734, "eval_loss": 0.43152591586112976, "eval_runtime": 3.7584, "eval_samples_per_second": 662.509, "eval_steps_per_second": 83.013, "step": 61360},
    {"epoch": 5.011408083441982, "grad_norm": 3.175994396209717, "learning_rate": 0.00014965775749674052, "loss": 0.4644, "step": 61500},
    {"epoch": 5.052151238591916, "grad_norm": 2.7221341133117676, "learning_rate": 0.0001484354628422425, "loss": 0.4479, "step": 62000},
    {"epoch": 5.092894393741851, "grad_norm": 2.5768377780914307, "learning_rate": 0.00014721316818774445, "loss": 0.4388, "step": 62500},
    {"epoch": 5.1336375488917865, "grad_norm": 2.2102932929992676, "learning_rate": 0.0001459908735332464, "loss": 0.4455, "step": 63000},
    {"epoch": 5.174380704041721, "grad_norm": 2.931208372116089, "learning_rate": 0.00014476857887874836, "loss": 0.4491, "step": 63500},
    {"epoch": 5.215123859191656, "grad_norm": 1.4483258724212646, "learning_rate": 0.00014354628422425032, "loss": 0.4412, "step": 64000},
    {"epoch": 5.25586701434159, "grad_norm": 1.9341588020324707, "learning_rate": 0.00014232398956975226, "loss": 0.4485, "step": 64500},
    {"epoch": 5.296610169491525, "grad_norm": 2.019665241241455, "learning_rate": 0.00014110169491525422, "loss": 0.443, "step": 65000},
    {"epoch": 5.337353324641461, "grad_norm": 2.324504852294922, "learning_rate": 0.0001398794002607562, "loss": 0.4574, "step": 65500},
    {"epoch": 5.378096479791395, "grad_norm": 1.5774611234664917, "learning_rate": 0.00013865710560625815, "loss": 0.4433, "step": 66000},
    {"epoch": 5.41883963494133, "grad_norm": 2.7634074687957764, "learning_rate": 0.0001374348109517601, "loss": 0.439, "step": 66500},
    {"epoch": 5.459582790091265, "grad_norm": 3.11040997505188, "learning_rate": 0.00013621251629726205, "loss": 0.461, "step": 67000},
    {"epoch": 5.500325945241199, "grad_norm": 1.974280834197998, "learning_rate": 0.000134990221642764, "loss": 0.4529, "step": 67500},
    {"epoch": 5.541069100391134, "grad_norm": 2.044983386993408, "learning_rate": 0.00013376792698826596, "loss": 0.4569, "step": 68000},
    {"epoch": 5.581812255541069, "grad_norm": 3.2112531661987305, "learning_rate": 0.00013254563233376792, "loss": 0.4464, "step": 68500},
    {"epoch": 5.622555410691004, "grad_norm": 3.1195149421691895, "learning_rate": 0.00013132333767926986, "loss": 0.4479, "step": 69000},
    {"epoch": 5.663298565840939, "grad_norm": 1.791577935218811, "learning_rate": 0.00013010104302477182, "loss": 0.4409, "step": 69500},
    {"epoch": 5.704041720990873, "grad_norm": 3.3514599800109863, "learning_rate": 0.00012887874837027379, "loss": 0.4477, "step": 70000},
    {"epoch": 5.744784876140808, "grad_norm": 1.6975388526916504, "learning_rate": 0.00012765645371577575, "loss": 0.4539, "step": 70500},
    {"epoch": 5.7855280312907436, "grad_norm": 1.7184616327285767, "learning_rate": 0.0001264341590612777, "loss": 0.4538, "step": 71000},
    {"epoch": 5.826271186440678, "grad_norm": 2.1258814334869385, "learning_rate": 0.00012521186440677965, "loss": 0.4477, "step": 71500},
    {"epoch": 5.867014341590613, "grad_norm": 3.146895170211792, "learning_rate": 0.0001239895697522816, "loss": 0.4393, "step": 72000},
    {"epoch": 5.9077574967405475, "grad_norm": 3.5936505794525146, "learning_rate": 0.00012276727509778355, "loss": 0.4496, "step": 72500},
    {"epoch": 5.948500651890482, "grad_norm": 2.3087146282196045, "learning_rate": 0.0001215449804432855, "loss": 0.4469, "step": 73000},
    {"epoch": 5.989243807040417, "grad_norm": 1.9528945684432983, "learning_rate": 0.00012032268578878747, "loss": 0.4435, "step": 73500},
    {"epoch": 6.0, "eval_accuracy": 0.8369477987289429, "eval_loss": 0.4312504529953003, "eval_runtime": 3.772, "eval_samples_per_second": 660.131, "eval_steps_per_second": 82.715, "step": 73632},
    {"epoch": 6.029986962190352, "grad_norm": 2.099862813949585, "learning_rate": 0.00011910039113428943, "loss": 0.4439, "step": 74000},
    {"epoch": 6.070730117340287, "grad_norm": 1.8885704278945923, "learning_rate": 0.00011787809647979139, "loss": 0.4367, "step": 74500},
    {"epoch": 6.111473272490222, "grad_norm": 2.2419934272766113, "learning_rate": 0.00011665580182529335, "loss": 0.4308, "step": 75000},
    {"epoch": 6.152216427640156, "grad_norm": 1.9617372751235962, "learning_rate": 0.0001154335071707953, "loss": 0.4325, "step": 75500},
    {"epoch": 6.192959582790091, "grad_norm": 2.785778522491455, "learning_rate": 0.00011421121251629725, "loss": 0.4363, "step": 76000},
    {"epoch": 6.2337027379400265, "grad_norm": 1.9203062057495117, "learning_rate": 0.0001129889178617992, "loss": 0.444, "step": 76500},
    {"epoch": 6.274445893089961, "grad_norm": 2.095256805419922, "learning_rate": 0.00011176662320730115, "loss": 0.4443, "step": 77000},
    {"epoch": 6.315189048239896, "grad_norm": 3.069437265396118, "learning_rate": 0.00011054432855280312, "loss": 0.4275, "step": 77500},
    {"epoch": 6.3559322033898304, "grad_norm": 2.62670636177063, "learning_rate": 0.00010932203389830507, "loss": 0.4411, "step": 78000},
    {"epoch": 6.396675358539765, "grad_norm": 3.285296678543091, "learning_rate": 0.00010809973924380703, "loss": 0.4512, "step": 78500},
    {"epoch": 6.4374185136897, "grad_norm": 1.5842539072036743, "learning_rate": 0.00010687744458930898, "loss": 0.4358, "step": 79000},
    {"epoch": 6.478161668839635, "grad_norm": 2.1644914150238037, "learning_rate": 0.00010565514993481095, "loss": 0.4418, "step": 79500},
    {"epoch": 6.51890482398957, "grad_norm": 2.471137523651123, "learning_rate": 0.0001044328552803129, "loss": 0.4356, "step": 80000},
    {"epoch": 6.559647979139505, "grad_norm": 2.335623025894165, "learning_rate": 0.00010321056062581485, "loss": 0.4406, "step": 80500},
    {"epoch": 6.600391134289439, "grad_norm": 4.2139081954956055, "learning_rate": 0.0001019882659713168, "loss": 0.4333, "step": 81000},
    {"epoch": 6.641134289439374, "grad_norm": 2.9831790924072266, "learning_rate": 0.00010076597131681877, "loss": 0.4409, "step": 81500},
    {"epoch": 6.681877444589309, "grad_norm": 1.6623711585998535, "learning_rate": 9.954367666232072e-05, "loss": 0.4375, "step": 82000},
    {"epoch": 6.722620599739244, "grad_norm": 2.8634488582611084, "learning_rate": 9.832138200782268e-05, "loss": 0.4288, "step": 82500},
    {"epoch": 6.763363754889179, "grad_norm": 3.4033591747283936, "learning_rate": 9.709908735332463e-05, "loss": 0.4357, "step": 83000},
    {"epoch": 6.804106910039113, "grad_norm": 2.3420987129211426, "learning_rate": 9.58767926988266e-05, "loss": 0.4443, "step": 83500},
    {"epoch": 6.844850065189048, "grad_norm": 2.943692684173584, "learning_rate": 9.465449804432855e-05, "loss": 0.4233, "step": 84000},
    {"epoch": 6.885593220338983, "grad_norm": 1.5020798444747925, "learning_rate": 9.34322033898305e-05, "loss": 0.4299, "step": 84500},
    {"epoch": 6.926336375488918, "grad_norm": 1.8622304201126099, "learning_rate": 9.220990873533245e-05, "loss": 0.4253, "step": 85000},
    {"epoch": 6.967079530638853, "grad_norm": 3.1434714794158936, "learning_rate": 9.098761408083442e-05, "loss": 0.4228, "step": 85500},
    {"epoch": 7.0, "eval_accuracy": 0.8309236764907837, "eval_loss": 0.43813642859458923, "eval_runtime": 3.7707, "eval_samples_per_second": 660.357, "eval_steps_per_second": 82.743, "step": 85904},
    {"epoch": 7.0078226857887875, "grad_norm": 2.7635395526885986, "learning_rate": 8.976531942633637e-05, "loss": 0.4382, "step": 86000},
    {"epoch": 7.048565840938722, "grad_norm": 2.407377004623413, "learning_rate": 8.854302477183832e-05, "loss": 0.4138, "step": 86500},
    {"epoch": 7.089308996088657, "grad_norm": 2.3589298725128174, "learning_rate": 8.732073011734028e-05, "loss": 0.4307, "step": 87000},
    {"epoch": 7.130052151238592, "grad_norm": 2.2621631622314453, "learning_rate": 8.609843546284225e-05, "loss": 0.4195, "step": 87500},
    {"epoch": 7.170795306388527, "grad_norm": 2.3238487243652344, "learning_rate": 8.48761408083442e-05, "loss": 0.4251, "step": 88000},
    {"epoch": 7.211538461538462, "grad_norm": 1.8608829975128174, "learning_rate": 8.365384615384615e-05, "loss": 0.4318, "step": 88500},
    {"epoch": 7.252281616688396, "grad_norm": 1.5675103664398193, "learning_rate": 8.24315514993481e-05, "loss": 0.43, "step": 89000},
    {"epoch": 7.293024771838331, "grad_norm": 3.2853474617004395, "learning_rate": 8.120925684485006e-05, "loss": 0.4336, "step": 89500},
    {"epoch": 7.333767926988266, "grad_norm": 2.1077964305877686, "learning_rate": 7.998696219035201e-05, "loss": 0.4304, "step": 90000},
    {"epoch": 7.374511082138201, "grad_norm": 2.3823015689849854, "learning_rate": 7.876466753585397e-05, "loss": 0.4269, "step": 90500},
    {"epoch": 7.415254237288136, "grad_norm": 2.7820513248443604, "learning_rate": 7.754237288135592e-05, "loss": 0.4152, "step": 91000},
    {"epoch": 7.4559973924380705, "grad_norm": 1.7721502780914307, "learning_rate": 7.63200782268579e-05, "loss": 0.4286, "step": 91500},
    {"epoch": 7.496740547588005, "grad_norm": 1.8405511379241943, "learning_rate": 7.509778357235985e-05, "loss": 0.4298, "step": 92000},
    {"epoch": 7.53748370273794, "grad_norm": 1.8865771293640137, "learning_rate": 7.387548891786178e-05, "loss": 0.4285, "step": 92500},
    {"epoch": 7.578226857887875, "grad_norm": 3.2232296466827393, "learning_rate": 7.265319426336375e-05, "loss": 0.4262, "step": 93000},
    {"epoch": 7.61897001303781, "grad_norm": 2.0036380290985107, "learning_rate": 7.14308996088657e-05, "loss": 0.4351, "step": 93500},
    {"epoch": 7.659713168187745, "grad_norm": 2.9521334171295166, "learning_rate": 7.020860495436766e-05, "loss": 0.4169, "step": 94000},
    {"epoch": 7.700456323337679, "grad_norm": 2.6853766441345215, "learning_rate": 6.898631029986961e-05, "loss": 0.4318, "step": 94500},
    {"epoch": 7.741199478487614, "grad_norm": 2.0400960445404053, "learning_rate": 6.776401564537158e-05, "loss": 0.4353, "step": 95000},
    {"epoch": 7.781942633637549, "grad_norm": 3.423992156982422, "learning_rate": 6.654172099087353e-05, "loss": 0.4236, "step": 95500},
    {"epoch": 7.822685788787483, "grad_norm": 2.914910078048706, "learning_rate": 6.531942633637548e-05, "loss": 0.4246, "step": 96000},
    {"epoch": 7.863428943937419, "grad_norm": 2.891737699508667, "learning_rate": 6.409713168187743e-05, "loss": 0.4278, "step": 96500},
    {"epoch": 7.904172099087353, "grad_norm": 4.279940605163574, "learning_rate": 6.28748370273794e-05, "loss": 0.4281, "step": 97000},
    {"epoch": 7.944915254237288, "grad_norm": 3.536177158355713, "learning_rate": 6.165254237288135e-05, "loss": 0.4283, "step": 97500},
    {"epoch": 7.985658409387223, "grad_norm": 2.03955078125, "learning_rate": 6.0430247718383304e-05, "loss": 0.4342, "step": 98000},
    {"epoch": 8.0, "eval_accuracy": 0.8365461826324463, "eval_loss": 0.4348722994327545, "eval_runtime": 3.765, "eval_samples_per_second": 661.356, "eval_steps_per_second": 82.869, "step": 98176},
    {"epoch": 8.026401564537158, "grad_norm": 3.3217365741729736, "learning_rate": 5.920795306388526e-05, "loss": 0.4183, "step": 98500},
    {"epoch": 8.067144719687093, "grad_norm": 2.7104313373565674, "learning_rate": 5.798565840938721e-05, "loss": 0.4115, "step": 99000},
    {"epoch": 8.107887874837028, "grad_norm": 1.903761386871338, "learning_rate": 5.676336375488918e-05, "loss": 0.4142, "step": 99500},
    {"epoch": 8.148631029986962, "grad_norm": 3.378157615661621, "learning_rate": 5.554106910039113e-05, "loss": 0.4248, "step": 100000},
    {"epoch": 8.189374185136897, "grad_norm": 2.3883683681488037, "learning_rate": 5.4318774445893086e-05, "loss": 0.4153, "step": 100500},
    {"epoch": 8.230117340286832, "grad_norm": 3.1941487789154053, "learning_rate": 5.309647979139504e-05, "loss": 0.4202, "step": 101000},
    {"epoch": 8.270860495436766, "grad_norm": 2.556144952774048, "learning_rate": 5.1874185136897e-05, "loss": 0.4212, "step": 101500},
    {"epoch": 8.3116036505867, "grad_norm": 2.3176252841949463, "learning_rate": 5.065189048239895e-05, "loss": 0.4153, "step": 102000},
    {"epoch": 8.352346805736635, "grad_norm": 2.065124750137329, "learning_rate": 4.942959582790091e-05, "loss": 0.4213, "step": 102500},
    {"epoch": 8.393089960886572, "grad_norm": 2.1476380825042725, "learning_rate": 4.820730117340286e-05, "loss": 0.4282, "step": 103000},
    {"epoch": 8.433833116036507, "grad_norm": 2.8716719150543213, "learning_rate": 4.698500651890482e-05, "loss": 0.4185, "step": 103500},
    {"epoch": 8.474576271186441, "grad_norm": 2.4064836502075195, "learning_rate": 4.576271186440678e-05, "loss": 0.4197, "step": 104000},
    {"epoch": 8.515319426336376, "grad_norm": 2.5717904567718506, "learning_rate": 4.4540417209908735e-05, "loss": 0.4259, "step": 104500},
    {"epoch": 8.55606258148631, "grad_norm": 3.6525187492370605, "learning_rate": 4.3318122555410686e-05, "loss": 0.4256, "step": 105000},
    {"epoch": 8.596805736636245, "grad_norm": 2.679851531982422, "learning_rate": 4.2095827900912643e-05, "loss": 0.4168, "step": 105500},
    {"epoch": 8.63754889178618, "grad_norm": 2.0390915870666504, "learning_rate": 4.08735332464146e-05, "loss": 0.4158, "step": 106000},
    {"epoch": 8.678292046936114, "grad_norm": 3.6200144290924072, "learning_rate": 3.965123859191656e-05, "loss": 0.426, "step": 106500},
    {"epoch": 8.719035202086049, "grad_norm": 2.4385690689086914, "learning_rate": 3.842894393741851e-05, "loss": 0.4243, "step": 107000},
    {"epoch": 8.759778357235984, "grad_norm": 3.2193117141723633, "learning_rate": 3.720664928292047e-05, "loss": 0.4155, "step": 107500},
    {"epoch": 8.800521512385918, "grad_norm": 3.227754831314087, "learning_rate": 3.5984354628422425e-05, "loss": 0.4224, "step": 108000},
    {"epoch": 8.841264667535853, "grad_norm": 1.8869520425796509, "learning_rate": 3.4762059973924376e-05, "loss": 0.4178, "step": 108500},
    {"epoch": 8.88200782268579, "grad_norm": 3.0674214363098145, "learning_rate": 3.3539765319426334e-05, "loss": 0.4216, "step": 109000},
    {"epoch": 8.922750977835724, "grad_norm": 2.215930461883545, "learning_rate": 3.2317470664928285e-05, "loss": 0.4205, "step": 109500},
    {"epoch": 8.963494132985659, "grad_norm": 3.146869421005249, "learning_rate": 3.109517601043025e-05, "loss": 0.4236, "step": 110000},
    {"epoch": 9.0, "eval_accuracy": 0.8373494148254395, "eval_loss": 0.43542519211769104, "eval_runtime": 3.7784, "eval_samples_per_second": 659.012, "eval_steps_per_second": 82.575, "step": 110448},
    {"epoch": 9.004237288135593, "grad_norm": 1.5620496273040771, "learning_rate": 2.98728813559322e-05, "loss": 0.4145, "step": 110500},
    {"epoch": 9.044980443285528, "grad_norm": 1.8520050048828125, "learning_rate": 2.8650586701434158e-05, "loss": 0.4108, "step": 111000},
    {"epoch": 9.085723598435463, "grad_norm": 3.6136269569396973, "learning_rate": 2.7428292046936113e-05, "loss": 0.4061, "step": 111500},
    {"epoch": 9.126466753585397, "grad_norm": 1.7939780950546265, "learning_rate": 2.6205997392438067e-05, "loss": 0.4247, "step": 112000},
    {"epoch": 9.167209908735332, "grad_norm": 1.5612313747406006, "learning_rate": 2.4983702737940025e-05, "loss": 0.4174, "step": 112500},
    {"epoch": 9.207953063885267, "grad_norm": 2.204183340072632, "learning_rate": 2.376140808344198e-05, "loss": 0.4185, "step": 113000},
    {"epoch": 9.248696219035201, "grad_norm": 3.115217447280884, "learning_rate": 2.2539113428943937e-05, "loss": 0.4106, "step": 113500},
    {"epoch": 9.289439374185136, "grad_norm": 2.998296022415161, "learning_rate": 2.131681877444589e-05, "loss": 0.4193, "step": 114000},
    {"epoch": 9.330182529335072, "grad_norm": 2.2693541049957275, "learning_rate": 2.009452411994785e-05, "loss": 0.4196, "step": 114500},
    {"epoch": 9.370925684485007, "grad_norm": 1.6824918985366821, "learning_rate": 1.8872229465449803e-05, "loss": 0.4116, "step": 115000},
    {"epoch": 9.411668839634942, "grad_norm": 3.1412432193756104, "learning_rate": 1.7649934810951758e-05, "loss": 0.4139, "step": 115500},
    {"epoch": 9.452411994784876, "grad_norm": 1.6451703310012817, "learning_rate": 1.6427640156453715e-05, "loss": 0.4155, "step": 116000},
    {"epoch": 9.493155149934811, "grad_norm": 2.260284900665283, "learning_rate": 1.520534550195567e-05, "loss": 0.4231, "step": 116500},
    {"epoch": 9.533898305084746, "grad_norm": 2.125267505645752, "learning_rate": 1.3983050847457626e-05, "loss": 0.4216, "step": 117000},
    {"epoch": 9.57464146023468, "grad_norm": 2.3000893592834473, "learning_rate": 1.2760756192959582e-05, "loss": 0.4078, "step": 117500},
    {"epoch": 9.615384615384615, "grad_norm": 2.3258585929870605, "learning_rate": 1.1538461538461538e-05, "loss": 0.4133, "step": 118000},
    {"epoch": 9.65612777053455, "grad_norm": 3.550931692123413, "learning_rate": 1.0316166883963494e-05, "loss": 0.4075, "step": 118500},
    {"epoch": 9.696870925684484, "grad_norm": 3.0916647911071777, "learning_rate": 9.093872229465448e-06, "loss": 0.4166, "step": 119000},
    {"epoch": 9.737614080834419, "grad_norm": 2.898163318634033, "learning_rate": 7.871577574967404e-06, "loss": 0.4154, "step": 119500},
    {"epoch": 9.778357235984355, "grad_norm": 1.945553183555603, "learning_rate": 6.649282920469361e-06, "loss": 0.4063, "step": 120000},
    {"epoch": 9.81910039113429, "grad_norm": 2.7394824028015137, "learning_rate": 5.426988265971316e-06, "loss": 0.4084, "step": 120500},
    {"epoch": 9.859843546284225, "grad_norm": 2.676351547241211, "learning_rate": 4.2046936114732716e-06, "loss": 0.4178, "step": 121000},
    {"epoch": 9.90058670143416, "grad_norm": 2.577317237854004, "learning_rate": 2.982398956975228e-06, "loss": 0.4139, "step": 121500},
    {"epoch": 9.941329856584094, "grad_norm": 4.512514114379883, "learning_rate": 1.7601043024771837e-06, "loss": 0.4115, "step": 122000},
    {"epoch": 9.982073011734029, "grad_norm": 2.307258129119873, "learning_rate": 5.378096479791394e-07, "loss": 0.4151, "step": 122500},
    {"epoch": 10.0, "eval_accuracy": 0.8361445665359497, "eval_loss": 0.4362991154193878, "eval_runtime": 3.7734, "eval_samples_per_second": 659.887, "eval_steps_per_second": 82.685, "step": 122720},
    {"epoch": 10.0, "step": 122720, "total_flos": 2.6143616931499008e+17, "train_loss": 0.4748868410093066, "train_runtime": 8727.3231, "train_samples_per_second": 449.968, "train_steps_per_second": 14.062}
  ],
  "logging_steps": 500,
  "max_steps": 122720,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.6143616931499008e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}