|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 15.0, |
|
"eval_steps": 500, |
|
"global_step": 184080, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04074315514993481, |
|
"grad_norm": 1.712476134300232, |
|
"learning_rate": 0.00029918513689700127, |
|
"loss": 1.0726, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08148631029986962, |
|
"grad_norm": 1.8538917303085327, |
|
"learning_rate": 0.0002983702737940026, |
|
"loss": 0.8568, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.12222946544980444, |
|
"grad_norm": 3.2072715759277344, |
|
"learning_rate": 0.0002975554106910039, |
|
"loss": 0.7747, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.16297262059973924, |
|
"grad_norm": 2.7058048248291016, |
|
"learning_rate": 0.0002967405475880052, |
|
"loss": 0.7254, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.20371577574967406, |
|
"grad_norm": 2.862517833709717, |
|
"learning_rate": 0.0002959256844850065, |
|
"loss": 0.6899, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.24445893089960888, |
|
"grad_norm": 1.9533528089523315, |
|
"learning_rate": 0.0002951108213820078, |
|
"loss": 0.6615, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.28520208604954367, |
|
"grad_norm": 1.9216587543487549, |
|
"learning_rate": 0.0002942959582790091, |
|
"loss": 0.6518, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.3259452411994785, |
|
"grad_norm": 2.019871711730957, |
|
"learning_rate": 0.0002934810951760104, |
|
"loss": 0.6282, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3666883963494133, |
|
"grad_norm": 2.8539257049560547, |
|
"learning_rate": 0.0002926662320730117, |
|
"loss": 0.6356, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.4074315514993481, |
|
"grad_norm": 2.5572264194488525, |
|
"learning_rate": 0.000291851368970013, |
|
"loss": 0.6119, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.44817470664928294, |
|
"grad_norm": 2.307138681411743, |
|
"learning_rate": 0.0002910365058670143, |
|
"loss": 0.6074, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.48891786179921776, |
|
"grad_norm": 1.5693212747573853, |
|
"learning_rate": 0.0002902216427640156, |
|
"loss": 0.5995, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5296610169491526, |
|
"grad_norm": 1.5355511903762817, |
|
"learning_rate": 0.00028940677966101693, |
|
"loss": 0.5984, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.5704041720990873, |
|
"grad_norm": 2.0896735191345215, |
|
"learning_rate": 0.0002885919165580182, |
|
"loss": 0.5802, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.6111473272490222, |
|
"grad_norm": 2.17629075050354, |
|
"learning_rate": 0.00028777705345501956, |
|
"loss": 0.5848, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.651890482398957, |
|
"grad_norm": 1.2360671758651733, |
|
"learning_rate": 0.00028696219035202086, |
|
"loss": 0.5867, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.6926336375488917, |
|
"grad_norm": 1.9337974786758423, |
|
"learning_rate": 0.00028614732724902215, |
|
"loss": 0.5769, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.7333767926988266, |
|
"grad_norm": 1.8343297243118286, |
|
"learning_rate": 0.00028533246414602344, |
|
"loss": 0.5785, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.7741199478487614, |
|
"grad_norm": 3.1703989505767822, |
|
"learning_rate": 0.00028451760104302473, |
|
"loss": 0.5726, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.8148631029986962, |
|
"grad_norm": 2.3858680725097656, |
|
"learning_rate": 0.000283702737940026, |
|
"loss": 0.562, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.855606258148631, |
|
"grad_norm": 2.681757688522339, |
|
"learning_rate": 0.00028288787483702737, |
|
"loss": 0.5628, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.8963494132985659, |
|
"grad_norm": 1.7328672409057617, |
|
"learning_rate": 0.00028207301173402866, |
|
"loss": 0.5584, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9370925684485006, |
|
"grad_norm": 2.7428441047668457, |
|
"learning_rate": 0.00028125814863102995, |
|
"loss": 0.5577, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.9778357235984355, |
|
"grad_norm": 2.219202995300293, |
|
"learning_rate": 0.00028044328552803124, |
|
"loss": 0.554, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.7979919910430908, |
|
"eval_loss": 0.5014437437057495, |
|
"eval_runtime": 3.8836, |
|
"eval_samples_per_second": 641.161, |
|
"eval_steps_per_second": 80.338, |
|
"step": 12272 |
|
}, |
|
{ |
|
"epoch": 1.0185788787483703, |
|
"grad_norm": 1.668083667755127, |
|
"learning_rate": 0.00027962842242503253, |
|
"loss": 0.5461, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.0593220338983051, |
|
"grad_norm": 2.8752121925354004, |
|
"learning_rate": 0.0002788135593220339, |
|
"loss": 0.5399, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.1000651890482398, |
|
"grad_norm": 1.8229767084121704, |
|
"learning_rate": 0.00027799869621903517, |
|
"loss": 0.5415, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.1408083441981747, |
|
"grad_norm": 1.681829571723938, |
|
"learning_rate": 0.00027718383311603646, |
|
"loss": 0.535, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.1815514993481095, |
|
"grad_norm": 1.5809299945831299, |
|
"learning_rate": 0.0002763689700130378, |
|
"loss": 0.5485, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.2222946544980444, |
|
"grad_norm": 1.6424498558044434, |
|
"learning_rate": 0.0002755541069100391, |
|
"loss": 0.5353, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.263037809647979, |
|
"grad_norm": 2.1335270404815674, |
|
"learning_rate": 0.0002747392438070404, |
|
"loss": 0.5356, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.303780964797914, |
|
"grad_norm": 3.0127382278442383, |
|
"learning_rate": 0.0002739243807040417, |
|
"loss": 0.53, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.3445241199478488, |
|
"grad_norm": 1.603621006011963, |
|
"learning_rate": 0.000273109517601043, |
|
"loss": 0.5298, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.3852672750977835, |
|
"grad_norm": 3.251509428024292, |
|
"learning_rate": 0.0002722946544980443, |
|
"loss": 0.5326, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.4260104302477183, |
|
"grad_norm": 1.379150390625, |
|
"learning_rate": 0.0002714797913950456, |
|
"loss": 0.5316, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.4667535853976532, |
|
"grad_norm": 1.778817892074585, |
|
"learning_rate": 0.0002706649282920469, |
|
"loss": 0.5233, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.5074967405475879, |
|
"grad_norm": 1.4545488357543945, |
|
"learning_rate": 0.00026985006518904825, |
|
"loss": 0.5292, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.548239895697523, |
|
"grad_norm": 1.7037944793701172, |
|
"learning_rate": 0.00026903520208604954, |
|
"loss": 0.5168, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.5889830508474576, |
|
"grad_norm": 2.0902042388916016, |
|
"learning_rate": 0.00026822033898305083, |
|
"loss": 0.53, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.6297262059973925, |
|
"grad_norm": 1.4677634239196777, |
|
"learning_rate": 0.0002674054758800521, |
|
"loss": 0.5258, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.6704693611473274, |
|
"grad_norm": 1.562299132347107, |
|
"learning_rate": 0.0002665906127770534, |
|
"loss": 0.5232, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.711212516297262, |
|
"grad_norm": 2.906895637512207, |
|
"learning_rate": 0.00026577574967405476, |
|
"loss": 0.5153, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.7519556714471969, |
|
"grad_norm": 3.625960350036621, |
|
"learning_rate": 0.00026496088657105605, |
|
"loss": 0.5137, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.7926988265971318, |
|
"grad_norm": 1.401186466217041, |
|
"learning_rate": 0.00026414602346805735, |
|
"loss": 0.5155, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.8334419817470664, |
|
"grad_norm": 4.507143974304199, |
|
"learning_rate": 0.00026333116036505864, |
|
"loss": 0.5239, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.8741851368970013, |
|
"grad_norm": 2.317826986312866, |
|
"learning_rate": 0.00026251629726205993, |
|
"loss": 0.5059, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.9149282920469362, |
|
"grad_norm": 1.684119701385498, |
|
"learning_rate": 0.0002617014341590612, |
|
"loss": 0.5206, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.9556714471968708, |
|
"grad_norm": 3.68534255027771, |
|
"learning_rate": 0.00026088657105606257, |
|
"loss": 0.5088, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.996414602346806, |
|
"grad_norm": 1.978193759918213, |
|
"learning_rate": 0.00026007170795306386, |
|
"loss": 0.5077, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.8108433485031128, |
|
"eval_loss": 0.48567625880241394, |
|
"eval_runtime": 3.8076, |
|
"eval_samples_per_second": 653.949, |
|
"eval_steps_per_second": 81.941, |
|
"step": 24544 |
|
}, |
|
{ |
|
"epoch": 2.0371577574967406, |
|
"grad_norm": 2.52195143699646, |
|
"learning_rate": 0.0002592568448500652, |
|
"loss": 0.4994, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.077900912646675, |
|
"grad_norm": 2.3119261264801025, |
|
"learning_rate": 0.0002584419817470665, |
|
"loss": 0.4989, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.1186440677966103, |
|
"grad_norm": 1.8698070049285889, |
|
"learning_rate": 0.0002576271186440678, |
|
"loss": 0.505, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.159387222946545, |
|
"grad_norm": 1.762832760810852, |
|
"learning_rate": 0.0002568122555410691, |
|
"loss": 0.5061, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.2001303780964796, |
|
"grad_norm": 1.8201305866241455, |
|
"learning_rate": 0.00025599739243807037, |
|
"loss": 0.5009, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.2408735332464147, |
|
"grad_norm": 2.5493578910827637, |
|
"learning_rate": 0.00025518252933507166, |
|
"loss": 0.4942, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.2816166883963493, |
|
"grad_norm": 1.689488172531128, |
|
"learning_rate": 0.000254367666232073, |
|
"loss": 0.505, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.322359843546284, |
|
"grad_norm": 1.5578385591506958, |
|
"learning_rate": 0.0002535528031290743, |
|
"loss": 0.4827, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 2.363102998696219, |
|
"grad_norm": 1.9981471300125122, |
|
"learning_rate": 0.0002527379400260756, |
|
"loss": 0.4901, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 2.4038461538461537, |
|
"grad_norm": 3.95417857170105, |
|
"learning_rate": 0.0002519230769230769, |
|
"loss": 0.4954, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 2.444589308996089, |
|
"grad_norm": 2.437424898147583, |
|
"learning_rate": 0.0002511082138200782, |
|
"loss": 0.5051, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 2.4853324641460235, |
|
"grad_norm": 2.264540910720825, |
|
"learning_rate": 0.0002502933507170795, |
|
"loss": 0.4995, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 2.526075619295958, |
|
"grad_norm": 1.7900969982147217, |
|
"learning_rate": 0.0002494784876140808, |
|
"loss": 0.5043, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 2.5668187744458932, |
|
"grad_norm": 2.914376735687256, |
|
"learning_rate": 0.0002486636245110821, |
|
"loss": 0.501, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 2.607561929595828, |
|
"grad_norm": 2.3340747356414795, |
|
"learning_rate": 0.00024784876140808345, |
|
"loss": 0.4856, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 2.648305084745763, |
|
"grad_norm": 1.6408894062042236, |
|
"learning_rate": 0.00024703389830508474, |
|
"loss": 0.4925, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 2.6890482398956976, |
|
"grad_norm": 2.726527690887451, |
|
"learning_rate": 0.00024621903520208603, |
|
"loss": 0.4925, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 2.7297913950456323, |
|
"grad_norm": 1.7461756467819214, |
|
"learning_rate": 0.0002454041720990873, |
|
"loss": 0.4949, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 2.770534550195567, |
|
"grad_norm": 1.021328330039978, |
|
"learning_rate": 0.0002445893089960886, |
|
"loss": 0.5003, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 2.811277705345502, |
|
"grad_norm": 1.8558237552642822, |
|
"learning_rate": 0.00024377444589308996, |
|
"loss": 0.4969, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 2.8520208604954367, |
|
"grad_norm": 1.888190507888794, |
|
"learning_rate": 0.00024295958279009125, |
|
"loss": 0.4763, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 2.8927640156453718, |
|
"grad_norm": 1.9714635610580444, |
|
"learning_rate": 0.00024214471968709255, |
|
"loss": 0.4823, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 2.9335071707953064, |
|
"grad_norm": 2.248117446899414, |
|
"learning_rate": 0.00024132985658409386, |
|
"loss": 0.4971, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.974250325945241, |
|
"grad_norm": 2.586214065551758, |
|
"learning_rate": 0.00024051499348109516, |
|
"loss": 0.4793, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.7767068147659302, |
|
"eval_loss": 0.5534041523933411, |
|
"eval_runtime": 3.8006, |
|
"eval_samples_per_second": 655.161, |
|
"eval_steps_per_second": 82.092, |
|
"step": 36816 |
|
}, |
|
{ |
|
"epoch": 3.014993481095176, |
|
"grad_norm": 1.693083643913269, |
|
"learning_rate": 0.00023970013037809645, |
|
"loss": 0.4938, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 3.055736636245111, |
|
"grad_norm": 1.6453471183776855, |
|
"learning_rate": 0.00023888526727509777, |
|
"loss": 0.479, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 3.0964797913950455, |
|
"grad_norm": 2.535121202468872, |
|
"learning_rate": 0.00023807040417209906, |
|
"loss": 0.4721, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 3.1372229465449806, |
|
"grad_norm": 2.6568410396575928, |
|
"learning_rate": 0.00023725554106910038, |
|
"loss": 0.4825, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 3.1779661016949152, |
|
"grad_norm": 1.9132291078567505, |
|
"learning_rate": 0.00023644067796610167, |
|
"loss": 0.4706, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 3.21870925684485, |
|
"grad_norm": 1.7388460636138916, |
|
"learning_rate": 0.00023562581486310296, |
|
"loss": 0.4722, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 3.259452411994785, |
|
"grad_norm": 2.760587692260742, |
|
"learning_rate": 0.0002348109517601043, |
|
"loss": 0.4776, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 3.3001955671447196, |
|
"grad_norm": 1.4599848985671997, |
|
"learning_rate": 0.0002339960886571056, |
|
"loss": 0.4784, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 3.3409387222946547, |
|
"grad_norm": 1.7198021411895752, |
|
"learning_rate": 0.0002331812255541069, |
|
"loss": 0.4727, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 3.3816818774445894, |
|
"grad_norm": 1.6919358968734741, |
|
"learning_rate": 0.0002323663624511082, |
|
"loss": 0.4851, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 3.422425032594524, |
|
"grad_norm": 2.1125240325927734, |
|
"learning_rate": 0.0002315514993481095, |
|
"loss": 0.4593, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 3.463168187744459, |
|
"grad_norm": 1.8193351030349731, |
|
"learning_rate": 0.00023073663624511082, |
|
"loss": 0.4753, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 3.5039113428943938, |
|
"grad_norm": 1.8467501401901245, |
|
"learning_rate": 0.0002299217731421121, |
|
"loss": 0.4769, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 3.5446544980443284, |
|
"grad_norm": 2.4582698345184326, |
|
"learning_rate": 0.0002291069100391134, |
|
"loss": 0.4725, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 3.5853976531942635, |
|
"grad_norm": 2.6381258964538574, |
|
"learning_rate": 0.00022829204693611472, |
|
"loss": 0.471, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 3.626140808344198, |
|
"grad_norm": 3.26521635055542, |
|
"learning_rate": 0.000227477183833116, |
|
"loss": 0.47, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 3.666883963494133, |
|
"grad_norm": 3.6461341381073, |
|
"learning_rate": 0.0002266623207301173, |
|
"loss": 0.4836, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 3.707627118644068, |
|
"grad_norm": 4.348172664642334, |
|
"learning_rate": 0.00022584745762711862, |
|
"loss": 0.478, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 3.7483702737940026, |
|
"grad_norm": 2.3575916290283203, |
|
"learning_rate": 0.0002250325945241199, |
|
"loss": 0.468, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 3.7891134289439377, |
|
"grad_norm": 1.4927334785461426, |
|
"learning_rate": 0.0002242177314211212, |
|
"loss": 0.4611, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 3.8298565840938723, |
|
"grad_norm": 2.9326066970825195, |
|
"learning_rate": 0.00022340286831812255, |
|
"loss": 0.476, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 3.870599739243807, |
|
"grad_norm": 1.4343719482421875, |
|
"learning_rate": 0.00022258800521512384, |
|
"loss": 0.4792, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 3.9113428943937416, |
|
"grad_norm": 2.0450692176818848, |
|
"learning_rate": 0.00022177314211212516, |
|
"loss": 0.4822, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 3.9520860495436767, |
|
"grad_norm": 1.492274284362793, |
|
"learning_rate": 0.00022095827900912645, |
|
"loss": 0.4602, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 3.9928292046936114, |
|
"grad_norm": 2.1512324810028076, |
|
"learning_rate": 0.00022014341590612774, |
|
"loss": 0.4791, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.8224899768829346, |
|
"eval_loss": 0.4442506432533264, |
|
"eval_runtime": 3.8847, |
|
"eval_samples_per_second": 640.977, |
|
"eval_steps_per_second": 80.315, |
|
"step": 49088 |
|
}, |
|
{ |
|
"epoch": 4.0335723598435465, |
|
"grad_norm": 1.4296700954437256, |
|
"learning_rate": 0.00021932855280312906, |
|
"loss": 0.4649, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 4.074315514993481, |
|
"grad_norm": 3.133362293243408, |
|
"learning_rate": 0.00021851368970013035, |
|
"loss": 0.4501, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 4.115058670143416, |
|
"grad_norm": 2.441312551498413, |
|
"learning_rate": 0.00021769882659713165, |
|
"loss": 0.4592, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 4.15580182529335, |
|
"grad_norm": 2.3577542304992676, |
|
"learning_rate": 0.00021688396349413296, |
|
"loss": 0.4532, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 4.196544980443286, |
|
"grad_norm": 2.5034992694854736, |
|
"learning_rate": 0.00021606910039113426, |
|
"loss": 0.4537, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 4.237288135593221, |
|
"grad_norm": 2.869847297668457, |
|
"learning_rate": 0.0002152542372881356, |
|
"loss": 0.4558, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 4.278031290743155, |
|
"grad_norm": 2.8850553035736084, |
|
"learning_rate": 0.0002144393741851369, |
|
"loss": 0.4559, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 4.31877444589309, |
|
"grad_norm": 2.0999245643615723, |
|
"learning_rate": 0.00021362451108213819, |
|
"loss": 0.4651, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 4.3595176010430245, |
|
"grad_norm": 2.392293930053711, |
|
"learning_rate": 0.0002128096479791395, |
|
"loss": 0.4621, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 4.400260756192959, |
|
"grad_norm": 1.9911226034164429, |
|
"learning_rate": 0.0002119947848761408, |
|
"loss": 0.4623, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 4.441003911342895, |
|
"grad_norm": 2.728233575820923, |
|
"learning_rate": 0.0002111799217731421, |
|
"loss": 0.4589, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 4.481747066492829, |
|
"grad_norm": 1.9613022804260254, |
|
"learning_rate": 0.0002103650586701434, |
|
"loss": 0.4588, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 4.522490221642764, |
|
"grad_norm": 2.742570161819458, |
|
"learning_rate": 0.0002095501955671447, |
|
"loss": 0.4541, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 4.563233376792699, |
|
"grad_norm": 1.8285661935806274, |
|
"learning_rate": 0.00020873533246414602, |
|
"loss": 0.4563, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 4.603976531942633, |
|
"grad_norm": 2.059859037399292, |
|
"learning_rate": 0.0002079204693611473, |
|
"loss": 0.459, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 4.644719687092568, |
|
"grad_norm": 2.426276445388794, |
|
"learning_rate": 0.0002071056062581486, |
|
"loss": 0.465, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 4.6854628422425035, |
|
"grad_norm": 2.499319553375244, |
|
"learning_rate": 0.00020629074315514992, |
|
"loss": 0.463, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 4.726205997392438, |
|
"grad_norm": 2.2192931175231934, |
|
"learning_rate": 0.0002054758800521512, |
|
"loss": 0.4556, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 4.766949152542373, |
|
"grad_norm": 3.1158642768859863, |
|
"learning_rate": 0.0002046610169491525, |
|
"loss": 0.457, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 4.8076923076923075, |
|
"grad_norm": 1.7781977653503418, |
|
"learning_rate": 0.00020384615384615385, |
|
"loss": 0.4664, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 4.848435462842242, |
|
"grad_norm": 1.9802038669586182, |
|
"learning_rate": 0.00020303129074315514, |
|
"loss": 0.4565, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 4.889178617992178, |
|
"grad_norm": 1.7128177881240845, |
|
"learning_rate": 0.00020221642764015643, |
|
"loss": 0.4573, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 4.929921773142112, |
|
"grad_norm": 3.5915613174438477, |
|
"learning_rate": 0.00020140156453715775, |
|
"loss": 0.4603, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 4.970664928292047, |
|
"grad_norm": 2.1721646785736084, |
|
"learning_rate": 0.00020058670143415904, |
|
"loss": 0.4541, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.8257027864456177, |
|
"eval_loss": 0.44336947798728943, |
|
"eval_runtime": 3.8541, |
|
"eval_samples_per_second": 646.057, |
|
"eval_steps_per_second": 80.952, |
|
"step": 61360 |
|
}, |
|
{ |
|
"epoch": 5.011408083441982, |
|
"grad_norm": 4.467598915100098, |
|
"learning_rate": 0.00019977183833116036, |
|
"loss": 0.462, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 5.052151238591916, |
|
"grad_norm": 3.215426445007324, |
|
"learning_rate": 0.00019895697522816165, |
|
"loss": 0.4433, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 5.092894393741851, |
|
"grad_norm": 3.5593578815460205, |
|
"learning_rate": 0.00019814211212516294, |
|
"loss": 0.4412, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 5.1336375488917865, |
|
"grad_norm": 1.5453704595565796, |
|
"learning_rate": 0.00019732724902216426, |
|
"loss": 0.4392, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 5.174380704041721, |
|
"grad_norm": 3.003392219543457, |
|
"learning_rate": 0.00019651238591916555, |
|
"loss": 0.4434, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 5.215123859191656, |
|
"grad_norm": 1.394499659538269, |
|
"learning_rate": 0.00019569752281616684, |
|
"loss": 0.4377, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 5.25586701434159, |
|
"grad_norm": 1.8028594255447388, |
|
"learning_rate": 0.00019488265971316816, |
|
"loss": 0.4457, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 5.296610169491525, |
|
"grad_norm": 2.3185994625091553, |
|
"learning_rate": 0.00019406779661016945, |
|
"loss": 0.4396, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 5.337353324641461, |
|
"grad_norm": 2.245239019393921, |
|
"learning_rate": 0.0001932529335071708, |
|
"loss": 0.4548, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 5.378096479791395, |
|
"grad_norm": 1.6174944639205933, |
|
"learning_rate": 0.0001924380704041721, |
|
"loss": 0.4431, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 5.41883963494133, |
|
"grad_norm": 2.5241715908050537, |
|
"learning_rate": 0.00019162320730117338, |
|
"loss": 0.4382, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 5.459582790091265, |
|
"grad_norm": 3.4499781131744385, |
|
"learning_rate": 0.0001908083441981747, |
|
"loss": 0.4552, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 5.500325945241199, |
|
"grad_norm": 2.127242088317871, |
|
"learning_rate": 0.000189993481095176, |
|
"loss": 0.4511, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 5.541069100391134, |
|
"grad_norm": 2.030122756958008, |
|
"learning_rate": 0.00018917861799217729, |
|
"loss": 0.4553, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 5.581812255541069, |
|
"grad_norm": 2.0190391540527344, |
|
"learning_rate": 0.0001883637548891786, |
|
"loss": 0.4471, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 5.622555410691004, |
|
"grad_norm": 3.642808198928833, |
|
"learning_rate": 0.0001875488917861799, |
|
"loss": 0.4494, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 5.663298565840939, |
|
"grad_norm": 2.013524055480957, |
|
"learning_rate": 0.00018673402868318121, |
|
"loss": 0.4425, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 5.704041720990873, |
|
"grad_norm": 3.961810350418091, |
|
"learning_rate": 0.0001859191655801825, |
|
"loss": 0.4438, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 5.744784876140808, |
|
"grad_norm": 1.9334365129470825, |
|
"learning_rate": 0.0001851043024771838, |
|
"loss": 0.4477, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 5.7855280312907436, |
|
"grad_norm": 2.67224383354187, |
|
"learning_rate": 0.00018428943937418514, |
|
"loss": 0.4522, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 5.826271186440678, |
|
"grad_norm": 2.349132776260376, |
|
"learning_rate": 0.00018347457627118644, |
|
"loss": 0.4467, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 5.867014341590613, |
|
"grad_norm": 2.8674731254577637, |
|
"learning_rate": 0.00018265971316818773, |
|
"loss": 0.4379, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 5.9077574967405475, |
|
"grad_norm": 3.5750834941864014, |
|
"learning_rate": 0.00018184485006518905, |
|
"loss": 0.4445, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 5.948500651890482, |
|
"grad_norm": 2.297048330307007, |
|
"learning_rate": 0.00018102998696219034, |
|
"loss": 0.4415, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 5.989243807040417, |
|
"grad_norm": 2.4230237007141113, |
|
"learning_rate": 0.00018021512385919163, |
|
"loss": 0.4405, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.83253014087677, |
|
"eval_loss": 0.42729276418685913, |
|
"eval_runtime": 3.8615, |
|
"eval_samples_per_second": 644.821, |
|
"eval_steps_per_second": 80.797, |
|
"step": 73632 |
|
}, |
|
{ |
|
"epoch": 6.029986962190352, |
|
"grad_norm": 1.7936686277389526, |
|
"learning_rate": 0.00017940026075619295, |
|
"loss": 0.437, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 6.070730117340287, |
|
"grad_norm": 2.2741811275482178, |
|
"learning_rate": 0.00017858539765319424, |
|
"loss": 0.434, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 6.111473272490222, |
|
"grad_norm": 2.475470542907715, |
|
"learning_rate": 0.00017777053455019556, |
|
"loss": 0.4298, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 6.152216427640156, |
|
"grad_norm": 1.6614030599594116, |
|
"learning_rate": 0.00017695567144719685, |
|
"loss": 0.43, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 6.192959582790091, |
|
"grad_norm": 2.708757162094116, |
|
"learning_rate": 0.00017614080834419814, |
|
"loss": 0.4335, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 6.2337027379400265, |
|
"grad_norm": 2.0995919704437256, |
|
"learning_rate": 0.00017532594524119946, |
|
"loss": 0.4378, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 6.274445893089961, |
|
"grad_norm": 2.3114981651306152, |
|
"learning_rate": 0.00017451108213820075, |
|
"loss": 0.445, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 6.315189048239896, |
|
"grad_norm": 2.9290952682495117, |
|
"learning_rate": 0.00017369621903520204, |
|
"loss": 0.4257, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 6.3559322033898304, |
|
"grad_norm": 3.3918747901916504, |
|
"learning_rate": 0.0001728813559322034, |
|
"loss": 0.4375, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 6.396675358539765, |
|
"grad_norm": 2.7095093727111816, |
|
"learning_rate": 0.00017206649282920468, |
|
"loss": 0.4464, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 6.4374185136897, |
|
"grad_norm": 1.904373049736023, |
|
"learning_rate": 0.000171251629726206, |
|
"loss": 0.4328, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 6.478161668839635, |
|
"grad_norm": 2.086244583129883, |
|
"learning_rate": 0.0001704367666232073, |
|
"loss": 0.4379, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 6.51890482398957, |
|
"grad_norm": 2.2188174724578857, |
|
"learning_rate": 0.00016962190352020858, |
|
"loss": 0.432, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 6.559647979139505, |
|
"grad_norm": 2.3787903785705566, |
|
"learning_rate": 0.0001688070404172099, |
|
"loss": 0.4368, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 6.600391134289439, |
|
"grad_norm": 2.9371328353881836, |
|
"learning_rate": 0.0001679921773142112, |
|
"loss": 0.4312, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 6.641134289439374, |
|
"grad_norm": 2.65150785446167, |
|
"learning_rate": 0.00016717731421121248, |
|
"loss": 0.4376, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 6.681877444589309, |
|
"grad_norm": 2.1882104873657227, |
|
"learning_rate": 0.0001663624511082138, |
|
"loss": 0.4365, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 6.722620599739244, |
|
"grad_norm": 2.618929862976074, |
|
"learning_rate": 0.0001655475880052151, |
|
"loss": 0.427, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 6.763363754889179, |
|
"grad_norm": 2.831859588623047, |
|
"learning_rate": 0.00016473272490221644, |
|
"loss": 0.4322, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 6.804106910039113, |
|
"grad_norm": 2.4017553329467773, |
|
"learning_rate": 0.00016391786179921773, |
|
"loss": 0.4425, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 6.844850065189048, |
|
"grad_norm": 2.4107367992401123, |
|
"learning_rate": 0.00016310299869621902, |
|
"loss": 0.4261, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 6.885593220338983, |
|
"grad_norm": 1.5284911394119263, |
|
"learning_rate": 0.00016228813559322034, |
|
"loss": 0.428, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 6.926336375488918, |
|
"grad_norm": 1.875391960144043, |
|
"learning_rate": 0.00016147327249022163, |
|
"loss": 0.4248, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 6.967079530638853, |
|
"grad_norm": 2.9802966117858887, |
|
"learning_rate": 0.00016065840938722293, |
|
"loss": 0.4208, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.8313252925872803, |
|
"eval_loss": 0.44049832224845886, |
|
"eval_runtime": 3.8652, |
|
"eval_samples_per_second": 644.21, |
|
"eval_steps_per_second": 80.72, |
|
"step": 85904 |
|
}, |
|
{ |
|
"epoch": 7.0078226857887875, |
|
"grad_norm": 4.224608898162842, |
|
"learning_rate": 0.00015984354628422424, |
|
"loss": 0.4366, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 7.048565840938722, |
|
"grad_norm": 2.473148822784424, |
|
"learning_rate": 0.00015902868318122554, |
|
"loss": 0.4106, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 7.089308996088657, |
|
"grad_norm": 2.758864164352417, |
|
"learning_rate": 0.00015821382007822685, |
|
"loss": 0.4249, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 7.130052151238592, |
|
"grad_norm": 2.041701555252075, |
|
"learning_rate": 0.00015739895697522815, |
|
"loss": 0.4187, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 7.170795306388527, |
|
"grad_norm": 2.219381093978882, |
|
"learning_rate": 0.00015658409387222944, |
|
"loss": 0.4239, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 7.211538461538462, |
|
"grad_norm": 2.239011526107788, |
|
"learning_rate": 0.00015576923076923076, |
|
"loss": 0.4256, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 7.252281616688396, |
|
"grad_norm": 1.8178561925888062, |
|
"learning_rate": 0.00015495436766623205, |
|
"loss": 0.4273, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 7.293024771838331, |
|
"grad_norm": 3.339010238647461, |
|
"learning_rate": 0.00015413950456323334, |
|
"loss": 0.4273, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 7.333767926988266, |
|
"grad_norm": 1.5560699701309204, |
|
"learning_rate": 0.00015332464146023469, |
|
"loss": 0.4254, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 7.374511082138201, |
|
"grad_norm": 2.376141309738159, |
|
"learning_rate": 0.00015250977835723598, |
|
"loss": 0.4231, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 7.415254237288136, |
|
"grad_norm": 2.4076344966888428, |
|
"learning_rate": 0.00015169491525423727, |
|
"loss": 0.4126, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 7.4559973924380705, |
|
"grad_norm": 1.661089301109314, |
|
"learning_rate": 0.0001508800521512386, |
|
"loss": 0.4224, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 7.496740547588005, |
|
"grad_norm": 2.04123854637146, |
|
"learning_rate": 0.00015006518904823988, |
|
"loss": 0.4272, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 7.53748370273794, |
|
"grad_norm": 1.8965297937393188, |
|
"learning_rate": 0.00014925032594524117, |
|
"loss": 0.4211, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 7.578226857887875, |
|
"grad_norm": 2.6887612342834473, |
|
"learning_rate": 0.0001484354628422425, |
|
"loss": 0.4249, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 7.61897001303781, |
|
"grad_norm": 2.034926414489746, |
|
"learning_rate": 0.0001476205997392438, |
|
"loss": 0.4289, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 7.659713168187745, |
|
"grad_norm": 3.313100814819336, |
|
"learning_rate": 0.0001468057366362451, |
|
"loss": 0.4127, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 7.700456323337679, |
|
"grad_norm": 2.1167919635772705, |
|
"learning_rate": 0.0001459908735332464, |
|
"loss": 0.4264, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 7.741199478487614, |
|
"grad_norm": 2.0670166015625, |
|
"learning_rate": 0.0001451760104302477, |
|
"loss": 0.4317, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 7.781942633637549, |
|
"grad_norm": 3.6086575984954834, |
|
"learning_rate": 0.00014436114732724903, |
|
"loss": 0.4232, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 7.822685788787483, |
|
"grad_norm": 2.8180601596832275, |
|
"learning_rate": 0.00014354628422425032, |
|
"loss": 0.424, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 7.863428943937419, |
|
"grad_norm": 2.9117753505706787, |
|
"learning_rate": 0.0001427314211212516, |
|
"loss": 0.4225, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 7.904172099087353, |
|
"grad_norm": 2.2281785011291504, |
|
"learning_rate": 0.00014191655801825293, |
|
"loss": 0.4236, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 7.944915254237288, |
|
"grad_norm": 3.034166097640991, |
|
"learning_rate": 0.00014110169491525422, |
|
"loss": 0.4283, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 7.985658409387223, |
|
"grad_norm": 2.297738552093506, |
|
"learning_rate": 0.00014028683181225551, |
|
"loss": 0.4278, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.8297188878059387, |
|
"eval_loss": 0.43755677342414856, |
|
"eval_runtime": 3.7829, |
|
"eval_samples_per_second": 658.227, |
|
"eval_steps_per_second": 82.477, |
|
"step": 98176 |
|
}, |
|
{ |
|
"epoch": 8.026401564537158, |
|
"grad_norm": 2.4886224269866943, |
|
"learning_rate": 0.00013947196870925683, |
|
"loss": 0.4122, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 8.067144719687093, |
|
"grad_norm": 1.8089336156845093, |
|
"learning_rate": 0.00013865710560625815, |
|
"loss": 0.403, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 8.107887874837028, |
|
"grad_norm": 3.1478216648101807, |
|
"learning_rate": 0.00013784224250325944, |
|
"loss": 0.4078, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 8.148631029986962, |
|
"grad_norm": 3.5064048767089844, |
|
"learning_rate": 0.00013702737940026073, |
|
"loss": 0.4212, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 8.189374185136897, |
|
"grad_norm": 3.1338980197906494, |
|
"learning_rate": 0.00013621251629726205, |
|
"loss": 0.4102, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 8.230117340286832, |
|
"grad_norm": 2.583284616470337, |
|
"learning_rate": 0.00013539765319426334, |
|
"loss": 0.4113, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 8.270860495436766, |
|
"grad_norm": 3.1805083751678467, |
|
"learning_rate": 0.00013458279009126466, |
|
"loss": 0.4144, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 8.3116036505867, |
|
"grad_norm": 2.0652964115142822, |
|
"learning_rate": 0.00013376792698826596, |
|
"loss": 0.4077, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 8.352346805736635, |
|
"grad_norm": 1.8258506059646606, |
|
"learning_rate": 0.00013295306388526727, |
|
"loss": 0.4151, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 8.393089960886572, |
|
"grad_norm": 2.371155261993408, |
|
"learning_rate": 0.00013213820078226857, |
|
"loss": 0.4225, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 8.433833116036507, |
|
"grad_norm": 3.617539405822754, |
|
"learning_rate": 0.00013132333767926986, |
|
"loss": 0.4158, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 8.474576271186441, |
|
"grad_norm": 2.629683017730713, |
|
"learning_rate": 0.00013050847457627118, |
|
"loss": 0.4099, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 8.515319426336376, |
|
"grad_norm": 2.359873056411743, |
|
"learning_rate": 0.00012969361147327247, |
|
"loss": 0.4171, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 8.55606258148631, |
|
"grad_norm": 2.8503479957580566, |
|
"learning_rate": 0.00012887874837027379, |
|
"loss": 0.4194, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 8.596805736636245, |
|
"grad_norm": 2.8921594619750977, |
|
"learning_rate": 0.00012806388526727508, |
|
"loss": 0.4125, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 8.63754889178618, |
|
"grad_norm": 1.8355835676193237, |
|
"learning_rate": 0.0001272490221642764, |
|
"loss": 0.412, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 8.678292046936114, |
|
"grad_norm": 3.0607216358184814, |
|
"learning_rate": 0.0001264341590612777, |
|
"loss": 0.4265, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 8.719035202086049, |
|
"grad_norm": 2.338379144668579, |
|
"learning_rate": 0.000125619295958279, |
|
"loss": 0.4142, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 8.759778357235984, |
|
"grad_norm": 2.316218137741089, |
|
"learning_rate": 0.0001248044328552803, |
|
"loss": 0.4099, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 8.800521512385918, |
|
"grad_norm": 2.4564082622528076, |
|
"learning_rate": 0.0001239895697522816, |
|
"loss": 0.4194, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 8.841264667535853, |
|
"grad_norm": 1.8719547986984253, |
|
"learning_rate": 0.0001231747066492829, |
|
"loss": 0.4132, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 8.88200782268579, |
|
"grad_norm": 3.052569627761841, |
|
"learning_rate": 0.00012235984354628423, |
|
"loss": 0.4194, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 8.922750977835724, |
|
"grad_norm": 2.181389808654785, |
|
"learning_rate": 0.0001215449804432855, |
|
"loss": 0.417, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 8.963494132985659, |
|
"grad_norm": 3.063595771789551, |
|
"learning_rate": 0.00012073011734028682, |
|
"loss": 0.422, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.833734929561615, |
|
"eval_loss": 0.43286681175231934, |
|
"eval_runtime": 3.8739, |
|
"eval_samples_per_second": 642.755, |
|
"eval_steps_per_second": 80.538, |
|
"step": 110448 |
|
}, |
|
{ |
|
"epoch": 9.004237288135593, |
|
"grad_norm": 2.0804457664489746, |
|
"learning_rate": 0.00011991525423728813, |
|
"loss": 0.4072, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 9.044980443285528, |
|
"grad_norm": 1.9648699760437012, |
|
"learning_rate": 0.00011910039113428943, |
|
"loss": 0.4031, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 9.085723598435463, |
|
"grad_norm": 3.1396656036376953, |
|
"learning_rate": 0.00011828552803129073, |
|
"loss": 0.3942, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 9.126466753585397, |
|
"grad_norm": 2.4233107566833496, |
|
"learning_rate": 0.00011747066492829203, |
|
"loss": 0.4171, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 9.167209908735332, |
|
"grad_norm": 1.7238380908966064, |
|
"learning_rate": 0.00011665580182529335, |
|
"loss": 0.4056, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 9.207953063885267, |
|
"grad_norm": 2.2021853923797607, |
|
"learning_rate": 0.00011584093872229466, |
|
"loss": 0.4089, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 9.248696219035201, |
|
"grad_norm": 2.9419503211975098, |
|
"learning_rate": 0.00011502607561929595, |
|
"loss": 0.399, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 9.289439374185136, |
|
"grad_norm": 2.092937469482422, |
|
"learning_rate": 0.00011421121251629725, |
|
"loss": 0.4113, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 9.330182529335072, |
|
"grad_norm": 2.0860626697540283, |
|
"learning_rate": 0.00011339634941329856, |
|
"loss": 0.4108, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 9.370925684485007, |
|
"grad_norm": 1.9479416608810425, |
|
"learning_rate": 0.00011258148631029986, |
|
"loss": 0.4068, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 9.411668839634942, |
|
"grad_norm": 2.6916277408599854, |
|
"learning_rate": 0.00011176662320730115, |
|
"loss": 0.4061, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 9.452411994784876, |
|
"grad_norm": 1.601837158203125, |
|
"learning_rate": 0.00011095176010430247, |
|
"loss": 0.4074, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 9.493155149934811, |
|
"grad_norm": 2.331357002258301, |
|
"learning_rate": 0.00011013689700130378, |
|
"loss": 0.4118, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 9.533898305084746, |
|
"grad_norm": 2.559669256210327, |
|
"learning_rate": 0.00010932203389830507, |
|
"loss": 0.4142, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 9.57464146023468, |
|
"grad_norm": 2.3902297019958496, |
|
"learning_rate": 0.00010850717079530637, |
|
"loss": 0.3977, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 9.615384615384615, |
|
"grad_norm": 2.525848388671875, |
|
"learning_rate": 0.00010769230769230768, |
|
"loss": 0.4037, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 9.65612777053455, |
|
"grad_norm": 3.530219316482544, |
|
"learning_rate": 0.00010687744458930898, |
|
"loss": 0.4019, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 9.696870925684484, |
|
"grad_norm": 2.741429090499878, |
|
"learning_rate": 0.00010606258148631028, |
|
"loss": 0.4096, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 9.737614080834419, |
|
"grad_norm": 3.318553924560547, |
|
"learning_rate": 0.0001052477183833116, |
|
"loss": 0.4056, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 9.778357235984355, |
|
"grad_norm": 2.6523921489715576, |
|
"learning_rate": 0.0001044328552803129, |
|
"loss": 0.4007, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 9.81910039113429, |
|
"grad_norm": 3.7088372707366943, |
|
"learning_rate": 0.0001036179921773142, |
|
"loss": 0.4016, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 9.859843546284225, |
|
"grad_norm": 2.5519940853118896, |
|
"learning_rate": 0.0001028031290743155, |
|
"loss": 0.4143, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 9.90058670143416, |
|
"grad_norm": 2.149285316467285, |
|
"learning_rate": 0.0001019882659713168, |
|
"loss": 0.4083, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 9.941329856584094, |
|
"grad_norm": 4.22469425201416, |
|
"learning_rate": 0.00010117340286831812, |
|
"loss": 0.404, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 9.982073011734029, |
|
"grad_norm": 2.2363908290863037, |
|
"learning_rate": 0.00010035853976531943, |
|
"loss": 0.4085, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.8401606678962708, |
|
"eval_loss": 0.42958390712738037, |
|
"eval_runtime": 3.786, |
|
"eval_samples_per_second": 657.679, |
|
"eval_steps_per_second": 82.408, |
|
"step": 122720 |
|
}, |
|
{ |
|
"epoch": 10.022816166883963, |
|
"grad_norm": 3.384526491165161, |
|
"learning_rate": 9.954367666232072e-05, |
|
"loss": 0.3999, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 10.063559322033898, |
|
"grad_norm": 3.051342725753784, |
|
"learning_rate": 9.872881355932202e-05, |
|
"loss": 0.3996, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 10.104302477183833, |
|
"grad_norm": 3.707674026489258, |
|
"learning_rate": 9.791395045632333e-05, |
|
"loss": 0.4043, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 10.145045632333767, |
|
"grad_norm": 1.7124032974243164, |
|
"learning_rate": 9.709908735332463e-05, |
|
"loss": 0.3918, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 10.185788787483702, |
|
"grad_norm": 2.3350818157196045, |
|
"learning_rate": 9.628422425032592e-05, |
|
"loss": 0.396, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 10.226531942633638, |
|
"grad_norm": 1.8520114421844482, |
|
"learning_rate": 9.546936114732724e-05, |
|
"loss": 0.3906, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 10.267275097783573, |
|
"grad_norm": 2.7649943828582764, |
|
"learning_rate": 9.465449804432855e-05, |
|
"loss": 0.3902, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 10.308018252933508, |
|
"grad_norm": 3.0913712978363037, |
|
"learning_rate": 9.383963494132985e-05, |
|
"loss": 0.4061, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 10.348761408083442, |
|
"grad_norm": 3.6730563640594482, |
|
"learning_rate": 9.302477183833115e-05, |
|
"loss": 0.4036, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 10.389504563233377, |
|
"grad_norm": 2.8968472480773926, |
|
"learning_rate": 9.220990873533245e-05, |
|
"loss": 0.3948, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 10.430247718383312, |
|
"grad_norm": 1.9545537233352661, |
|
"learning_rate": 9.139504563233377e-05, |
|
"loss": 0.4039, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 10.470990873533246, |
|
"grad_norm": 2.1482009887695312, |
|
"learning_rate": 9.058018252933507e-05, |
|
"loss": 0.4032, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 10.51173402868318, |
|
"grad_norm": 4.479248046875, |
|
"learning_rate": 8.976531942633637e-05, |
|
"loss": 0.3827, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 10.552477183833116, |
|
"grad_norm": 2.6518211364746094, |
|
"learning_rate": 8.895045632333767e-05, |
|
"loss": 0.3963, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 10.59322033898305, |
|
"grad_norm": 2.565751314163208, |
|
"learning_rate": 8.813559322033898e-05, |
|
"loss": 0.4135, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 10.633963494132985, |
|
"grad_norm": 3.31779146194458, |
|
"learning_rate": 8.732073011734028e-05, |
|
"loss": 0.4073, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 10.674706649282921, |
|
"grad_norm": 1.9514780044555664, |
|
"learning_rate": 8.650586701434157e-05, |
|
"loss": 0.401, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 10.715449804432856, |
|
"grad_norm": 4.615423679351807, |
|
"learning_rate": 8.569100391134289e-05, |
|
"loss": 0.3972, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 10.75619295958279, |
|
"grad_norm": 3.4876339435577393, |
|
"learning_rate": 8.48761408083442e-05, |
|
"loss": 0.3958, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 10.796936114732725, |
|
"grad_norm": 1.969255805015564, |
|
"learning_rate": 8.406127770534549e-05, |
|
"loss": 0.3997, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 10.83767926988266, |
|
"grad_norm": 3.2826197147369385, |
|
"learning_rate": 8.32464146023468e-05, |
|
"loss": 0.4014, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 10.878422425032594, |
|
"grad_norm": 2.9294662475585938, |
|
"learning_rate": 8.24315514993481e-05, |
|
"loss": 0.3942, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 10.91916558018253, |
|
"grad_norm": 2.2191972732543945, |
|
"learning_rate": 8.161668839634942e-05, |
|
"loss": 0.3821, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 10.959908735332464, |
|
"grad_norm": 2.2126150131225586, |
|
"learning_rate": 8.080182529335071e-05, |
|
"loss": 0.396, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.8333333134651184, |
|
"eval_loss": 0.4348280727863312, |
|
"eval_runtime": 3.8732, |
|
"eval_samples_per_second": 642.875, |
|
"eval_steps_per_second": 80.553, |
|
"step": 134992 |
|
}, |
|
{ |
|
"epoch": 11.000651890482398, |
|
"grad_norm": 1.9272228479385376, |
|
"learning_rate": 7.998696219035201e-05, |
|
"loss": 0.409, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 11.041395045632333, |
|
"grad_norm": 3.325286865234375, |
|
"learning_rate": 7.917209908735332e-05, |
|
"loss": 0.3948, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 11.082138200782268, |
|
"grad_norm": 2.996323585510254, |
|
"learning_rate": 7.835723598435462e-05, |
|
"loss": 0.3884, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 11.122881355932204, |
|
"grad_norm": 2.5405139923095703, |
|
"learning_rate": 7.754237288135592e-05, |
|
"loss": 0.3932, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 11.163624511082139, |
|
"grad_norm": 2.4877593517303467, |
|
"learning_rate": 7.672750977835722e-05, |
|
"loss": 0.3908, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 11.204367666232073, |
|
"grad_norm": 2.917015552520752, |
|
"learning_rate": 7.591264667535854e-05, |
|
"loss": 0.3827, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 11.245110821382008, |
|
"grad_norm": 2.060572624206543, |
|
"learning_rate": 7.509778357235985e-05, |
|
"loss": 0.3938, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 11.285853976531943, |
|
"grad_norm": 3.6868770122528076, |
|
"learning_rate": 7.428292046936114e-05, |
|
"loss": 0.3943, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 11.326597131681877, |
|
"grad_norm": 2.118516683578491, |
|
"learning_rate": 7.346805736636244e-05, |
|
"loss": 0.3871, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 11.367340286831812, |
|
"grad_norm": 2.2013978958129883, |
|
"learning_rate": 7.265319426336375e-05, |
|
"loss": 0.3875, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 11.408083441981747, |
|
"grad_norm": 2.284522533416748, |
|
"learning_rate": 7.183833116036505e-05, |
|
"loss": 0.3937, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 11.448826597131681, |
|
"grad_norm": 1.935478925704956, |
|
"learning_rate": 7.102346805736636e-05, |
|
"loss": 0.3933, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 11.489569752281616, |
|
"grad_norm": 3.882283926010132, |
|
"learning_rate": 7.020860495436766e-05, |
|
"loss": 0.3882, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 11.53031290743155, |
|
"grad_norm": 2.2980778217315674, |
|
"learning_rate": 6.939374185136897e-05, |
|
"loss": 0.3994, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 11.571056062581487, |
|
"grad_norm": 3.7042973041534424, |
|
"learning_rate": 6.857887874837027e-05, |
|
"loss": 0.3894, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 11.611799217731422, |
|
"grad_norm": 2.877511739730835, |
|
"learning_rate": 6.776401564537158e-05, |
|
"loss": 0.4033, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 11.652542372881356, |
|
"grad_norm": 3.1929280757904053, |
|
"learning_rate": 6.694915254237287e-05, |
|
"loss": 0.3913, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 11.693285528031291, |
|
"grad_norm": 2.0072107315063477, |
|
"learning_rate": 6.613428943937419e-05, |
|
"loss": 0.3971, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 11.734028683181226, |
|
"grad_norm": 1.9861186742782593, |
|
"learning_rate": 6.531942633637548e-05, |
|
"loss": 0.4003, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 11.77477183833116, |
|
"grad_norm": 2.227025032043457, |
|
"learning_rate": 6.450456323337679e-05, |
|
"loss": 0.4003, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 11.815514993481095, |
|
"grad_norm": 2.0405077934265137, |
|
"learning_rate": 6.368970013037809e-05, |
|
"loss": 0.3931, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 11.85625814863103, |
|
"grad_norm": 3.3660271167755127, |
|
"learning_rate": 6.28748370273794e-05, |
|
"loss": 0.3934, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 11.897001303780964, |
|
"grad_norm": 2.728158473968506, |
|
"learning_rate": 6.20599739243807e-05, |
|
"loss": 0.3843, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 11.937744458930899, |
|
"grad_norm": 2.6212921142578125, |
|
"learning_rate": 6.124511082138199e-05, |
|
"loss": 0.3852, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 11.978487614080834, |
|
"grad_norm": 2.473024368286133, |
|
"learning_rate": 6.0430247718383304e-05, |
|
"loss": 0.3909, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.8357429504394531, |
|
"eval_loss": 0.42502352595329285, |
|
"eval_runtime": 3.8531, |
|
"eval_samples_per_second": 646.227, |
|
"eval_steps_per_second": 80.973, |
|
"step": 147264 |
|
}, |
|
{ |
|
"epoch": 12.01923076923077, |
|
"grad_norm": 3.567250967025757, |
|
"learning_rate": 5.961538461538461e-05, |
|
"loss": 0.3983, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 12.059973924380705, |
|
"grad_norm": 1.7462067604064941, |
|
"learning_rate": 5.8800521512385915e-05, |
|
"loss": 0.3794, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 12.10071707953064, |
|
"grad_norm": 2.288787364959717, |
|
"learning_rate": 5.798565840938721e-05, |
|
"loss": 0.3925, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 12.141460234680574, |
|
"grad_norm": 3.1145968437194824, |
|
"learning_rate": 5.7170795306388525e-05, |
|
"loss": 0.3793, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 12.182203389830509, |
|
"grad_norm": 2.16363525390625, |
|
"learning_rate": 5.6355932203389824e-05, |
|
"loss": 0.3804, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 12.222946544980443, |
|
"grad_norm": 2.6342670917510986, |
|
"learning_rate": 5.554106910039113e-05, |
|
"loss": 0.3861, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 12.263689700130378, |
|
"grad_norm": 2.9809041023254395, |
|
"learning_rate": 5.4726205997392434e-05, |
|
"loss": 0.3913, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 12.304432855280313, |
|
"grad_norm": 3.3812155723571777, |
|
"learning_rate": 5.391134289439374e-05, |
|
"loss": 0.3857, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 12.345176010430247, |
|
"grad_norm": 2.890817165374756, |
|
"learning_rate": 5.309647979139504e-05, |
|
"loss": 0.393, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 12.385919165580182, |
|
"grad_norm": 3.3339462280273438, |
|
"learning_rate": 5.228161668839635e-05, |
|
"loss": 0.3873, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 12.426662320730117, |
|
"grad_norm": 2.7341129779815674, |
|
"learning_rate": 5.146675358539765e-05, |
|
"loss": 0.3876, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 12.467405475880053, |
|
"grad_norm": 3.096959114074707, |
|
"learning_rate": 5.065189048239895e-05, |
|
"loss": 0.3908, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 12.508148631029988, |
|
"grad_norm": 1.7687112092971802, |
|
"learning_rate": 4.983702737940025e-05, |
|
"loss": 0.394, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 12.548891786179922, |
|
"grad_norm": 2.028165102005005, |
|
"learning_rate": 4.902216427640156e-05, |
|
"loss": 0.3883, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 12.589634941329857, |
|
"grad_norm": 2.485379934310913, |
|
"learning_rate": 4.820730117340286e-05, |
|
"loss": 0.3855, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 12.630378096479792, |
|
"grad_norm": 1.7456655502319336, |
|
"learning_rate": 4.7392438070404173e-05, |
|
"loss": 0.3968, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 12.671121251629726, |
|
"grad_norm": 2.4976985454559326, |
|
"learning_rate": 4.657757496740547e-05, |
|
"loss": 0.3798, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 12.711864406779661, |
|
"grad_norm": 3.9520442485809326, |
|
"learning_rate": 4.576271186440678e-05, |
|
"loss": 0.3894, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 12.752607561929596, |
|
"grad_norm": 2.648386240005493, |
|
"learning_rate": 4.4947848761408075e-05, |
|
"loss": 0.3851, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 12.79335071707953, |
|
"grad_norm": 2.492152690887451, |
|
"learning_rate": 4.413298565840939e-05, |
|
"loss": 0.3822, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 12.834093872229465, |
|
"grad_norm": 2.9962518215179443, |
|
"learning_rate": 4.3318122555410686e-05, |
|
"loss": 0.3858, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 12.8748370273794, |
|
"grad_norm": 2.332040309906006, |
|
"learning_rate": 4.2503259452412e-05, |
|
"loss": 0.3925, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 12.915580182529336, |
|
"grad_norm": 2.6551926136016846, |
|
"learning_rate": 4.1688396349413296e-05, |
|
"loss": 0.3899, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 12.95632333767927, |
|
"grad_norm": 1.7805209159851074, |
|
"learning_rate": 4.08735332464146e-05, |
|
"loss": 0.3885, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 12.997066492829205, |
|
"grad_norm": 2.9438204765319824, |
|
"learning_rate": 4.00586701434159e-05, |
|
"loss": 0.3956, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.8381525874137878, |
|
"eval_loss": 0.4289119243621826, |
|
"eval_runtime": 3.8072, |
|
"eval_samples_per_second": 654.032, |
|
"eval_steps_per_second": 81.951, |
|
"step": 159536 |
|
}, |
|
{ |
|
"epoch": 13.03780964797914, |
|
"grad_norm": 2.8224751949310303, |
|
"learning_rate": 3.924380704041721e-05, |
|
"loss": 0.3818, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 13.078552803129075, |
|
"grad_norm": 2.0187859535217285, |
|
"learning_rate": 3.842894393741851e-05, |
|
"loss": 0.3791, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 13.11929595827901, |
|
"grad_norm": 3.1369576454162598, |
|
"learning_rate": 3.761408083441981e-05, |
|
"loss": 0.3799, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 13.160039113428944, |
|
"grad_norm": 2.373286485671997, |
|
"learning_rate": 3.679921773142112e-05, |
|
"loss": 0.3733, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 13.200782268578878, |
|
"grad_norm": 2.583207130432129, |
|
"learning_rate": 3.5984354628422425e-05, |
|
"loss": 0.3958, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 13.241525423728813, |
|
"grad_norm": 2.5118906497955322, |
|
"learning_rate": 3.5169491525423724e-05, |
|
"loss": 0.3915, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 13.282268578878748, |
|
"grad_norm": 5.202625751495361, |
|
"learning_rate": 3.435462842242503e-05, |
|
"loss": 0.3879, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 13.323011734028682, |
|
"grad_norm": 2.9979419708251953, |
|
"learning_rate": 3.3539765319426334e-05, |
|
"loss": 0.3737, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 13.363754889178619, |
|
"grad_norm": 2.0817720890045166, |
|
"learning_rate": 3.272490221642764e-05, |
|
"loss": 0.3912, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 13.404498044328554, |
|
"grad_norm": 2.691849946975708, |
|
"learning_rate": 3.1910039113428944e-05, |
|
"loss": 0.3689, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 13.445241199478488, |
|
"grad_norm": 2.358008861541748, |
|
"learning_rate": 3.109517601043025e-05, |
|
"loss": 0.3793, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 13.485984354628423, |
|
"grad_norm": 2.514547109603882, |
|
"learning_rate": 3.028031290743155e-05, |
|
"loss": 0.3868, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 13.526727509778357, |
|
"grad_norm": 2.5108165740966797, |
|
"learning_rate": 2.9465449804432853e-05, |
|
"loss": 0.385, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 13.567470664928292, |
|
"grad_norm": 3.075470447540283, |
|
"learning_rate": 2.8650586701434158e-05, |
|
"loss": 0.3774, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 13.608213820078227, |
|
"grad_norm": 4.978045463562012, |
|
"learning_rate": 2.7835723598435463e-05, |
|
"loss": 0.3787, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 13.648956975228161, |
|
"grad_norm": 2.699185609817505, |
|
"learning_rate": 2.7020860495436762e-05, |
|
"loss": 0.3772, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 13.689700130378096, |
|
"grad_norm": 1.7423195838928223, |
|
"learning_rate": 2.6205997392438067e-05, |
|
"loss": 0.3838, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 13.73044328552803, |
|
"grad_norm": 1.603785753250122, |
|
"learning_rate": 2.539113428943937e-05, |
|
"loss": 0.3804, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 13.771186440677965, |
|
"grad_norm": 2.6994235515594482, |
|
"learning_rate": 2.4576271186440674e-05, |
|
"loss": 0.3803, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 13.811929595827902, |
|
"grad_norm": 3.038980484008789, |
|
"learning_rate": 2.376140808344198e-05, |
|
"loss": 0.3823, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 13.852672750977836, |
|
"grad_norm": 3.170668840408325, |
|
"learning_rate": 2.294654498044328e-05, |
|
"loss": 0.3803, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 13.893415906127771, |
|
"grad_norm": 2.6691057682037354, |
|
"learning_rate": 2.2131681877444586e-05, |
|
"loss": 0.3809, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 13.934159061277706, |
|
"grad_norm": 1.789117455482483, |
|
"learning_rate": 2.131681877444589e-05, |
|
"loss": 0.3917, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 13.97490221642764, |
|
"grad_norm": 2.110405445098877, |
|
"learning_rate": 2.0501955671447193e-05, |
|
"loss": 0.3792, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.8365461826324463, |
|
"eval_loss": 0.4235801696777344, |
|
"eval_runtime": 3.8552, |
|
"eval_samples_per_second": 645.875, |
|
"eval_steps_per_second": 80.929, |
|
"step": 171808 |
|
}, |
|
{ |
|
"epoch": 14.015645371577575, |
|
"grad_norm": 3.1774420738220215, |
|
"learning_rate": 1.9687092568448498e-05, |
|
"loss": 0.3883, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 14.05638852672751, |
|
"grad_norm": 3.016127109527588, |
|
"learning_rate": 1.8872229465449803e-05, |
|
"loss": 0.3798, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 14.097131681877444, |
|
"grad_norm": 1.6009718179702759, |
|
"learning_rate": 1.8057366362451105e-05, |
|
"loss": 0.3712, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 14.137874837027379, |
|
"grad_norm": 3.2171220779418945, |
|
"learning_rate": 1.724250325945241e-05, |
|
"loss": 0.3752, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 14.178617992177314, |
|
"grad_norm": 2.144103765487671, |
|
"learning_rate": 1.6427640156453715e-05, |
|
"loss": 0.3752, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 14.219361147327248, |
|
"grad_norm": 1.7222505807876587, |
|
"learning_rate": 1.5612777053455017e-05, |
|
"loss": 0.389, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 14.260104302477185, |
|
"grad_norm": 1.7213879823684692, |
|
"learning_rate": 1.4797913950456322e-05, |
|
"loss": 0.3752, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 14.30084745762712, |
|
"grad_norm": 3.261892080307007, |
|
"learning_rate": 1.3983050847457626e-05, |
|
"loss": 0.3813, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 14.341590612777054, |
|
"grad_norm": 3.9659616947174072, |
|
"learning_rate": 1.3168187744458931e-05, |
|
"loss": 0.3846, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 14.382333767926989, |
|
"grad_norm": 2.451526403427124, |
|
"learning_rate": 1.2353324641460234e-05, |
|
"loss": 0.3762, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 14.423076923076923, |
|
"grad_norm": 3.402191638946533, |
|
"learning_rate": 1.1538461538461538e-05, |
|
"loss": 0.3783, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 14.463820078226858, |
|
"grad_norm": 1.925841212272644, |
|
"learning_rate": 1.0723598435462841e-05, |
|
"loss": 0.3673, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 14.504563233376793, |
|
"grad_norm": 2.2188963890075684, |
|
"learning_rate": 9.908735332464146e-06, |
|
"loss": 0.3754, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 14.545306388526727, |
|
"grad_norm": 2.604687213897705, |
|
"learning_rate": 9.093872229465448e-06, |
|
"loss": 0.3789, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 14.586049543676662, |
|
"grad_norm": 3.5460221767425537, |
|
"learning_rate": 8.279009126466753e-06, |
|
"loss": 0.3684, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 14.626792698826597, |
|
"grad_norm": 1.9674248695373535, |
|
"learning_rate": 7.464146023468057e-06, |
|
"loss": 0.3732, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 14.667535853976531, |
|
"grad_norm": 2.8252646923065186, |
|
"learning_rate": 6.649282920469361e-06, |
|
"loss": 0.3735, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 14.708279009126468, |
|
"grad_norm": 3.43896746635437, |
|
"learning_rate": 5.834419817470664e-06, |
|
"loss": 0.383, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 14.749022164276402, |
|
"grad_norm": 2.6553750038146973, |
|
"learning_rate": 5.019556714471968e-06, |
|
"loss": 0.3877, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 14.789765319426337, |
|
"grad_norm": 3.002777576446533, |
|
"learning_rate": 4.2046936114732716e-06, |
|
"loss": 0.3757, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 14.830508474576272, |
|
"grad_norm": 3.0359461307525635, |
|
"learning_rate": 3.389830508474576e-06, |
|
"loss": 0.3903, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 14.871251629726206, |
|
"grad_norm": 2.365903615951538, |
|
"learning_rate": 2.5749674054758798e-06, |
|
"loss": 0.3818, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 14.911994784876141, |
|
"grad_norm": 3.1819570064544678, |
|
"learning_rate": 1.7601043024771837e-06, |
|
"loss": 0.3835, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 14.952737940026076, |
|
"grad_norm": 2.5495858192443848, |
|
"learning_rate": 9.452411994784876e-07, |
|
"loss": 0.3751, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 14.99348109517601, |
|
"grad_norm": 2.4265575408935547, |
|
"learning_rate": 1.303780964797914e-07, |
|
"loss": 0.3759, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.8361445665359497, |
|
"eval_loss": 0.4246699810028076, |
|
"eval_runtime": 3.8083, |
|
"eval_samples_per_second": 653.84, |
|
"eval_steps_per_second": 81.927, |
|
"step": 184080 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"step": 184080, |
|
"total_flos": 3.921542539724851e+17, |
|
"train_loss": 0.4436988375695671, |
|
"train_runtime": 13089.0372, |
|
"train_samples_per_second": 450.035, |
|
"train_steps_per_second": 14.064 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 184080, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.921542539724851e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|