|
{ |
|
"best_metric": 1.454202651977539, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.06560818790185015, |
|
"eval_steps": 100, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0001312163758037003, |
|
"grad_norm": 4.502290725708008, |
|
"learning_rate": 5e-06, |
|
"loss": 2.7959, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0001312163758037003, |
|
"eval_loss": 3.9797542095184326, |
|
"eval_runtime": 1367.0112, |
|
"eval_samples_per_second": 9.39, |
|
"eval_steps_per_second": 2.347, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0002624327516074006, |
|
"grad_norm": 4.730992794036865, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0455, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0003936491274111009, |
|
"grad_norm": 4.7559990882873535, |
|
"learning_rate": 1.5e-05, |
|
"loss": 3.0463, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0005248655032148012, |
|
"grad_norm": 4.74609899520874, |
|
"learning_rate": 2e-05, |
|
"loss": 3.0803, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0006560818790185015, |
|
"grad_norm": 4.335945129394531, |
|
"learning_rate": 2.5e-05, |
|
"loss": 2.885, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0007872982548222018, |
|
"grad_norm": 4.600176811218262, |
|
"learning_rate": 3e-05, |
|
"loss": 3.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0009185146306259021, |
|
"grad_norm": 4.308350563049316, |
|
"learning_rate": 3.5e-05, |
|
"loss": 2.9882, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0010497310064296024, |
|
"grad_norm": 4.452569484710693, |
|
"learning_rate": 4e-05, |
|
"loss": 2.8844, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0011809473822333026, |
|
"grad_norm": 5.113404750823975, |
|
"learning_rate": 4.5e-05, |
|
"loss": 2.8185, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.001312163758037003, |
|
"grad_norm": 4.601175785064697, |
|
"learning_rate": 5e-05, |
|
"loss": 2.7809, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0014433801338407032, |
|
"grad_norm": 4.8205156326293945, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 2.514, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0015745965096444037, |
|
"grad_norm": 4.931856155395508, |
|
"learning_rate": 6e-05, |
|
"loss": 2.6665, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0017058128854481039, |
|
"grad_norm": 6.019423007965088, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 2.6067, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0018370292612518043, |
|
"grad_norm": 4.860208034515381, |
|
"learning_rate": 7e-05, |
|
"loss": 2.7381, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0019682456370555047, |
|
"grad_norm": 5.671669960021973, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 2.4756, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.002099462012859205, |
|
"grad_norm": 4.821615219116211, |
|
"learning_rate": 8e-05, |
|
"loss": 2.5178, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.002230678388662905, |
|
"grad_norm": 5.072446823120117, |
|
"learning_rate": 8.5e-05, |
|
"loss": 2.4209, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0023618947644666053, |
|
"grad_norm": 5.528189659118652, |
|
"learning_rate": 9e-05, |
|
"loss": 2.4476, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.002493111140270306, |
|
"grad_norm": 4.594228267669678, |
|
"learning_rate": 9.5e-05, |
|
"loss": 2.2791, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.002624327516074006, |
|
"grad_norm": 5.120175838470459, |
|
"learning_rate": 0.0001, |
|
"loss": 2.2744, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0027555438918777063, |
|
"grad_norm": 4.523880958557129, |
|
"learning_rate": 9.999892908320647e-05, |
|
"loss": 2.0823, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0028867602676814065, |
|
"grad_norm": 5.103494167327881, |
|
"learning_rate": 9.999571637870036e-05, |
|
"loss": 2.2586, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.003017976643485107, |
|
"grad_norm": 4.991058826446533, |
|
"learning_rate": 9.999036202410325e-05, |
|
"loss": 2.1277, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0031491930192888073, |
|
"grad_norm": 4.875056743621826, |
|
"learning_rate": 9.998286624877786e-05, |
|
"loss": 2.1464, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0032804093950925075, |
|
"grad_norm": 4.341136932373047, |
|
"learning_rate": 9.997322937381829e-05, |
|
"loss": 2.1301, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0034116257708962077, |
|
"grad_norm": 4.211912155151367, |
|
"learning_rate": 9.996145181203615e-05, |
|
"loss": 2.0398, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0035428421466999083, |
|
"grad_norm": 4.3173651695251465, |
|
"learning_rate": 9.994753406794301e-05, |
|
"loss": 2.063, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0036740585225036085, |
|
"grad_norm": 4.31871223449707, |
|
"learning_rate": 9.99314767377287e-05, |
|
"loss": 1.9695, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0038052748983073087, |
|
"grad_norm": 5.267412185668945, |
|
"learning_rate": 9.991328050923581e-05, |
|
"loss": 2.017, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.003936491274111009, |
|
"grad_norm": 4.116550922393799, |
|
"learning_rate": 9.989294616193017e-05, |
|
"loss": 1.674, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.004067707649914709, |
|
"grad_norm": 4.253255367279053, |
|
"learning_rate": 9.98704745668676e-05, |
|
"loss": 1.8298, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.00419892402571841, |
|
"grad_norm": 7.945616245269775, |
|
"learning_rate": 9.98458666866564e-05, |
|
"loss": 1.8191, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.00433014040152211, |
|
"grad_norm": 4.653039455413818, |
|
"learning_rate": 9.981912357541627e-05, |
|
"loss": 1.786, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.00446135677732581, |
|
"grad_norm": 5.207880020141602, |
|
"learning_rate": 9.97902463787331e-05, |
|
"loss": 1.9124, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.004592573153129511, |
|
"grad_norm": 4.892001152038574, |
|
"learning_rate": 9.975923633360985e-05, |
|
"loss": 1.5737, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0047237895289332105, |
|
"grad_norm": 4.775864124298096, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 1.7467, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.004855005904736911, |
|
"grad_norm": 5.307229518890381, |
|
"learning_rate": 9.969082310281891e-05, |
|
"loss": 1.5144, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.004986222280540612, |
|
"grad_norm": 5.3355560302734375, |
|
"learning_rate": 9.965342284774632e-05, |
|
"loss": 1.576, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0051174386563443116, |
|
"grad_norm": 4.783757209777832, |
|
"learning_rate": 9.961389560529836e-05, |
|
"loss": 1.3935, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.005248655032148012, |
|
"grad_norm": 4.547131061553955, |
|
"learning_rate": 9.957224306869053e-05, |
|
"loss": 1.4015, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.005379871407951712, |
|
"grad_norm": 6.1063055992126465, |
|
"learning_rate": 9.952846702217886e-05, |
|
"loss": 1.402, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.005511087783755413, |
|
"grad_norm": 4.746460437774658, |
|
"learning_rate": 9.948256934098352e-05, |
|
"loss": 1.1739, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.005642304159559113, |
|
"grad_norm": 4.250523567199707, |
|
"learning_rate": 9.943455199120837e-05, |
|
"loss": 1.2131, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.005773520535362813, |
|
"grad_norm": 5.288604736328125, |
|
"learning_rate": 9.938441702975689e-05, |
|
"loss": 1.1556, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.005904736911166514, |
|
"grad_norm": 5.479332447052002, |
|
"learning_rate": 9.933216660424395e-05, |
|
"loss": 1.2704, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.006035953286970214, |
|
"grad_norm": 4.803893566131592, |
|
"learning_rate": 9.927780295290389e-05, |
|
"loss": 1.0887, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.006167169662773914, |
|
"grad_norm": 4.377706050872803, |
|
"learning_rate": 9.922132840449459e-05, |
|
"loss": 0.8161, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.006298386038577615, |
|
"grad_norm": 4.871409893035889, |
|
"learning_rate": 9.916274537819775e-05, |
|
"loss": 0.8478, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.006429602414381314, |
|
"grad_norm": 4.593444347381592, |
|
"learning_rate": 9.91020563835152e-05, |
|
"loss": 0.9841, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.006560818790185015, |
|
"grad_norm": 6.342800617218018, |
|
"learning_rate": 9.903926402016153e-05, |
|
"loss": 0.8137, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.006692035165988716, |
|
"grad_norm": 3.515503406524658, |
|
"learning_rate": 9.897437097795257e-05, |
|
"loss": 2.5555, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.006823251541792415, |
|
"grad_norm": 2.8081209659576416, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 2.4244, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.006954467917596116, |
|
"grad_norm": 2.0723488330841064, |
|
"learning_rate": 9.883829406604363e-05, |
|
"loss": 2.255, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.007085684293399817, |
|
"grad_norm": 1.63498055934906, |
|
"learning_rate": 9.876711602542563e-05, |
|
"loss": 2.219, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0072169006692035164, |
|
"grad_norm": 1.5121456384658813, |
|
"learning_rate": 9.869384896386668e-05, |
|
"loss": 2.2263, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.007348117045007217, |
|
"grad_norm": 1.7543376684188843, |
|
"learning_rate": 9.861849601988383e-05, |
|
"loss": 2.2319, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.007479333420810917, |
|
"grad_norm": 2.030773639678955, |
|
"learning_rate": 9.854106042134641e-05, |
|
"loss": 2.299, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0076105497966146175, |
|
"grad_norm": 2.207674264907837, |
|
"learning_rate": 9.846154548533773e-05, |
|
"loss": 2.3749, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.007741766172418318, |
|
"grad_norm": 2.266341209411621, |
|
"learning_rate": 9.837995461801299e-05, |
|
"loss": 2.3051, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.007872982548222019, |
|
"grad_norm": 2.5948050022125244, |
|
"learning_rate": 9.829629131445342e-05, |
|
"loss": 2.2048, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.008004198924025718, |
|
"grad_norm": 3.1021060943603516, |
|
"learning_rate": 9.821055915851647e-05, |
|
"loss": 2.3619, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.008135415299829418, |
|
"grad_norm": 3.0172550678253174, |
|
"learning_rate": 9.812276182268236e-05, |
|
"loss": 2.198, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.008266631675633119, |
|
"grad_norm": 4.317417144775391, |
|
"learning_rate": 9.803290306789676e-05, |
|
"loss": 2.1099, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.00839784805143682, |
|
"grad_norm": 4.255865097045898, |
|
"learning_rate": 9.794098674340965e-05, |
|
"loss": 2.2599, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.00852906442724052, |
|
"grad_norm": 3.712496280670166, |
|
"learning_rate": 9.784701678661045e-05, |
|
"loss": 2.0834, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.00866028080304422, |
|
"grad_norm": 5.127752304077148, |
|
"learning_rate": 9.775099722285935e-05, |
|
"loss": 2.082, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.00879149717884792, |
|
"grad_norm": 3.70094633102417, |
|
"learning_rate": 9.765293216531486e-05, |
|
"loss": 2.2118, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.00892271355465162, |
|
"grad_norm": 3.54309344291687, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 1.8701, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.009053929930455321, |
|
"grad_norm": 4.72603702545166, |
|
"learning_rate": 9.74506824594107e-05, |
|
"loss": 2.1741, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.009185146306259022, |
|
"grad_norm": 4.20818567276001, |
|
"learning_rate": 9.73465064747553e-05, |
|
"loss": 1.9426, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.009316362682062722, |
|
"grad_norm": 3.797800064086914, |
|
"learning_rate": 9.724030232334391e-05, |
|
"loss": 1.8893, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.009447579057866421, |
|
"grad_norm": 4.630662441253662, |
|
"learning_rate": 9.713207455460894e-05, |
|
"loss": 1.91, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.009578795433670122, |
|
"grad_norm": 4.435169696807861, |
|
"learning_rate": 9.702182780466775e-05, |
|
"loss": 1.8245, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.009710011809473822, |
|
"grad_norm": 4.108457088470459, |
|
"learning_rate": 9.690956679612421e-05, |
|
"loss": 1.8773, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.009841228185277523, |
|
"grad_norm": 3.7498910427093506, |
|
"learning_rate": 9.67952963378663e-05, |
|
"loss": 1.5919, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.009972444561081224, |
|
"grad_norm": 3.4361634254455566, |
|
"learning_rate": 9.667902132486009e-05, |
|
"loss": 1.5667, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.010103660936884922, |
|
"grad_norm": 3.993279218673706, |
|
"learning_rate": 9.656074673794018e-05, |
|
"loss": 1.8395, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.010234877312688623, |
|
"grad_norm": 3.958552598953247, |
|
"learning_rate": 9.644047764359622e-05, |
|
"loss": 1.7246, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.010366093688492324, |
|
"grad_norm": 4.14636754989624, |
|
"learning_rate": 9.631821919375591e-05, |
|
"loss": 1.6861, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.010497310064296024, |
|
"grad_norm": 4.084466934204102, |
|
"learning_rate": 9.619397662556435e-05, |
|
"loss": 1.4632, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.010628526440099725, |
|
"grad_norm": 3.8091354370117188, |
|
"learning_rate": 9.606775526115963e-05, |
|
"loss": 1.6115, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.010759742815903424, |
|
"grad_norm": 3.6534502506256104, |
|
"learning_rate": 9.593956050744492e-05, |
|
"loss": 1.4976, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.010890959191707125, |
|
"grad_norm": 4.479729652404785, |
|
"learning_rate": 9.580939785585681e-05, |
|
"loss": 1.6682, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.011022175567510825, |
|
"grad_norm": 4.333984375, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 1.7758, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.011153391943314526, |
|
"grad_norm": 5.193741321563721, |
|
"learning_rate": 9.554319124605879e-05, |
|
"loss": 1.382, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.011284608319118226, |
|
"grad_norm": 4.566789627075195, |
|
"learning_rate": 9.540715869125407e-05, |
|
"loss": 1.5179, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.011415824694921927, |
|
"grad_norm": 4.185393810272217, |
|
"learning_rate": 9.526918104489777e-05, |
|
"loss": 1.3469, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.011547041070725626, |
|
"grad_norm": 4.056511878967285, |
|
"learning_rate": 9.512926421749304e-05, |
|
"loss": 1.2551, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.011678257446529327, |
|
"grad_norm": 4.229364395141602, |
|
"learning_rate": 9.498741420261108e-05, |
|
"loss": 1.4315, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.011809473822333027, |
|
"grad_norm": 3.8965346813201904, |
|
"learning_rate": 9.484363707663442e-05, |
|
"loss": 1.4387, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.011940690198136728, |
|
"grad_norm": 4.694686412811279, |
|
"learning_rate": 9.469793899849661e-05, |
|
"loss": 1.5771, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.012071906573940428, |
|
"grad_norm": 5.034073352813721, |
|
"learning_rate": 9.45503262094184e-05, |
|
"loss": 1.6219, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.012203122949744127, |
|
"grad_norm": 4.071391582489014, |
|
"learning_rate": 9.440080503264037e-05, |
|
"loss": 1.0868, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.012334339325547828, |
|
"grad_norm": 4.7768168449401855, |
|
"learning_rate": 9.42493818731521e-05, |
|
"loss": 1.2184, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.012465555701351529, |
|
"grad_norm": 4.308196067810059, |
|
"learning_rate": 9.409606321741775e-05, |
|
"loss": 0.852, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.01259677207715523, |
|
"grad_norm": 3.9685397148132324, |
|
"learning_rate": 9.394085563309827e-05, |
|
"loss": 1.0385, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.01272798845295893, |
|
"grad_norm": 3.9740753173828125, |
|
"learning_rate": 9.378376576876999e-05, |
|
"loss": 0.9448, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.012859204828762629, |
|
"grad_norm": 3.3262381553649902, |
|
"learning_rate": 9.362480035363986e-05, |
|
"loss": 0.6915, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.01299042120456633, |
|
"grad_norm": 5.052366256713867, |
|
"learning_rate": 9.34639661972572e-05, |
|
"loss": 0.8459, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.01312163758037003, |
|
"grad_norm": 4.026389122009277, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 0.6136, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01312163758037003, |
|
"eval_loss": 1.995261549949646, |
|
"eval_runtime": 1352.3203, |
|
"eval_samples_per_second": 9.492, |
|
"eval_steps_per_second": 2.373, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01325285395617373, |
|
"grad_norm": 3.1780271530151367, |
|
"learning_rate": 9.31367192988896e-05, |
|
"loss": 2.3667, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.013384070331977431, |
|
"grad_norm": 2.605407238006592, |
|
"learning_rate": 9.297032057507264e-05, |
|
"loss": 2.2015, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.013515286707781132, |
|
"grad_norm": 1.9591269493103027, |
|
"learning_rate": 9.280208114573859e-05, |
|
"loss": 2.0853, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.01364650308358483, |
|
"grad_norm": 1.4972039461135864, |
|
"learning_rate": 9.263200821770461e-05, |
|
"loss": 2.0733, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.013777719459388531, |
|
"grad_norm": 1.6665318012237549, |
|
"learning_rate": 9.246010907632895e-05, |
|
"loss": 2.0467, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.013908935835192232, |
|
"grad_norm": 1.6789878606796265, |
|
"learning_rate": 9.228639108519868e-05, |
|
"loss": 2.1175, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.014040152210995933, |
|
"grad_norm": 2.0789976119995117, |
|
"learning_rate": 9.211086168581433e-05, |
|
"loss": 2.2926, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.014171368586799633, |
|
"grad_norm": 2.8981029987335205, |
|
"learning_rate": 9.193352839727121e-05, |
|
"loss": 2.1756, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.014302584962603332, |
|
"grad_norm": 2.743936538696289, |
|
"learning_rate": 9.175439881593716e-05, |
|
"loss": 2.1466, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.014433801338407033, |
|
"grad_norm": 3.987334728240967, |
|
"learning_rate": 9.157348061512727e-05, |
|
"loss": 2.2559, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.014565017714210734, |
|
"grad_norm": 3.1965720653533936, |
|
"learning_rate": 9.139078154477512e-05, |
|
"loss": 2.2562, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.014696234090014434, |
|
"grad_norm": 3.892089605331421, |
|
"learning_rate": 9.120630943110077e-05, |
|
"loss": 2.1416, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.014827450465818135, |
|
"grad_norm": 3.2245702743530273, |
|
"learning_rate": 9.102007217627568e-05, |
|
"loss": 2.0573, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.014958666841621834, |
|
"grad_norm": 3.0087993144989014, |
|
"learning_rate": 9.083207775808396e-05, |
|
"loss": 1.9645, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.015089883217425534, |
|
"grad_norm": 2.868622303009033, |
|
"learning_rate": 9.064233422958077e-05, |
|
"loss": 2.0366, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.015221099593229235, |
|
"grad_norm": 3.084916830062866, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 2.0537, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.015352315969032936, |
|
"grad_norm": 3.2997586727142334, |
|
"learning_rate": 9.025763242814291e-05, |
|
"loss": 2.0362, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.015483532344836636, |
|
"grad_norm": 4.677249431610107, |
|
"learning_rate": 9.006269063455304e-05, |
|
"loss": 1.8742, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.015614748720640335, |
|
"grad_norm": 4.065469741821289, |
|
"learning_rate": 8.986603268863536e-05, |
|
"loss": 1.7975, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.015745965096444037, |
|
"grad_norm": 3.6069369316101074, |
|
"learning_rate": 8.966766701456177e-05, |
|
"loss": 1.7775, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.015877181472247738, |
|
"grad_norm": 4.557692527770996, |
|
"learning_rate": 8.94676021096575e-05, |
|
"loss": 1.8745, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.016008397848051435, |
|
"grad_norm": 3.5455398559570312, |
|
"learning_rate": 8.926584654403724e-05, |
|
"loss": 1.7982, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.016139614223855136, |
|
"grad_norm": 3.4866509437561035, |
|
"learning_rate": 8.906240896023794e-05, |
|
"loss": 1.7942, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.016270830599658836, |
|
"grad_norm": 3.9673101902008057, |
|
"learning_rate": 8.885729807284856e-05, |
|
"loss": 1.6027, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.016402046975462537, |
|
"grad_norm": 4.192202568054199, |
|
"learning_rate": 8.865052266813685e-05, |
|
"loss": 1.7069, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.016533263351266238, |
|
"grad_norm": 3.4886741638183594, |
|
"learning_rate": 8.844209160367299e-05, |
|
"loss": 1.6127, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.01666447972706994, |
|
"grad_norm": 4.3129448890686035, |
|
"learning_rate": 8.823201380795001e-05, |
|
"loss": 1.8012, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.01679569610287364, |
|
"grad_norm": 4.046361923217773, |
|
"learning_rate": 8.802029828000156e-05, |
|
"loss": 1.6391, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.01692691247867734, |
|
"grad_norm": 3.780560255050659, |
|
"learning_rate": 8.780695408901613e-05, |
|
"loss": 1.8057, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.01705812885448104, |
|
"grad_norm": 4.130734443664551, |
|
"learning_rate": 8.759199037394887e-05, |
|
"loss": 1.7628, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.01718934523028474, |
|
"grad_norm": 4.335687160491943, |
|
"learning_rate": 8.737541634312985e-05, |
|
"loss": 1.7281, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.01732056160608844, |
|
"grad_norm": 3.8232834339141846, |
|
"learning_rate": 8.715724127386972e-05, |
|
"loss": 1.6856, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.01745177798189214, |
|
"grad_norm": 4.297300338745117, |
|
"learning_rate": 8.693747451206232e-05, |
|
"loss": 1.4618, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.01758299435769584, |
|
"grad_norm": 3.4930808544158936, |
|
"learning_rate": 8.671612547178428e-05, |
|
"loss": 1.3184, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.01771421073349954, |
|
"grad_norm": 3.8762755393981934, |
|
"learning_rate": 8.649320363489179e-05, |
|
"loss": 1.4333, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.01784542710930324, |
|
"grad_norm": 4.0097126960754395, |
|
"learning_rate": 8.626871855061438e-05, |
|
"loss": 1.4535, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.01797664348510694, |
|
"grad_norm": 4.139339923858643, |
|
"learning_rate": 8.604267983514594e-05, |
|
"loss": 1.4856, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.018107859860910642, |
|
"grad_norm": 3.6388514041900635, |
|
"learning_rate": 8.581509717123273e-05, |
|
"loss": 1.3928, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.018239076236714342, |
|
"grad_norm": 3.7924964427948, |
|
"learning_rate": 8.558598030775857e-05, |
|
"loss": 1.2458, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.018370292612518043, |
|
"grad_norm": 4.4245758056640625, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 1.4169, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.018501508988321744, |
|
"grad_norm": 3.6946239471435547, |
|
"learning_rate": 8.51231833058426e-05, |
|
"loss": 1.1482, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.018632725364125444, |
|
"grad_norm": 3.711535692214966, |
|
"learning_rate": 8.488952299208401e-05, |
|
"loss": 1.065, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.01876394173992914, |
|
"grad_norm": 3.8962907791137695, |
|
"learning_rate": 8.46543681272818e-05, |
|
"loss": 1.2842, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.018895158115732842, |
|
"grad_norm": 3.2645106315612793, |
|
"learning_rate": 8.44177287846877e-05, |
|
"loss": 0.8512, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.019026374491536543, |
|
"grad_norm": 3.1213479042053223, |
|
"learning_rate": 8.417961510114356e-05, |
|
"loss": 0.7018, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.019157590867340243, |
|
"grad_norm": 4.530689716339111, |
|
"learning_rate": 8.39400372766471e-05, |
|
"loss": 0.9977, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.019288807243143944, |
|
"grad_norm": 3.922245740890503, |
|
"learning_rate": 8.36990055739149e-05, |
|
"loss": 0.6566, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.019420023618947645, |
|
"grad_norm": 4.297173500061035, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 0.7287, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.019551239994751345, |
|
"grad_norm": 4.700723648071289, |
|
"learning_rate": 8.321262189556409e-05, |
|
"loss": 0.724, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.019682456370555046, |
|
"grad_norm": 4.133970737457275, |
|
"learning_rate": 8.296729075500344e-05, |
|
"loss": 0.5965, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.019813672746358747, |
|
"grad_norm": 2.415865898132324, |
|
"learning_rate": 8.272054740543052e-05, |
|
"loss": 2.1392, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.019944889122162447, |
|
"grad_norm": 2.01712965965271, |
|
"learning_rate": 8.247240241650918e-05, |
|
"loss": 2.1176, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.020076105497966148, |
|
"grad_norm": 1.6205581426620483, |
|
"learning_rate": 8.222286641794488e-05, |
|
"loss": 2.1266, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.020207321873769845, |
|
"grad_norm": 1.8821977376937866, |
|
"learning_rate": 8.197195009902924e-05, |
|
"loss": 2.126, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.020338538249573546, |
|
"grad_norm": 1.4580323696136475, |
|
"learning_rate": 8.171966420818228e-05, |
|
"loss": 2.081, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.020469754625377246, |
|
"grad_norm": 1.5964078903198242, |
|
"learning_rate": 8.146601955249188e-05, |
|
"loss": 2.1827, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.020600971001180947, |
|
"grad_norm": 2.922008752822876, |
|
"learning_rate": 8.121102699725089e-05, |
|
"loss": 2.1915, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.020732187376984648, |
|
"grad_norm": 2.2149009704589844, |
|
"learning_rate": 8.095469746549172e-05, |
|
"loss": 2.1432, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.020863403752788348, |
|
"grad_norm": 2.5889828205108643, |
|
"learning_rate": 8.069704193751832e-05, |
|
"loss": 2.101, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.02099462012859205, |
|
"grad_norm": 3.5003163814544678, |
|
"learning_rate": 8.043807145043604e-05, |
|
"loss": 2.1131, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.02112583650439575, |
|
"grad_norm": 3.9812967777252197, |
|
"learning_rate": 8.017779709767858e-05, |
|
"loss": 2.1469, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.02125705288019945, |
|
"grad_norm": 3.2259273529052734, |
|
"learning_rate": 7.991623002853296e-05, |
|
"loss": 2.0289, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.02138826925600315, |
|
"grad_norm": 4.384939670562744, |
|
"learning_rate": 7.965338144766186e-05, |
|
"loss": 2.2525, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.021519485631806848, |
|
"grad_norm": 4.085958957672119, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 2.0116, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.02165070200761055, |
|
"grad_norm": 2.951061964035034, |
|
"learning_rate": 7.912388484339012e-05, |
|
"loss": 2.0182, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.02178191838341425, |
|
"grad_norm": 5.261083126068115, |
|
"learning_rate": 7.88572595018617e-05, |
|
"loss": 1.8649, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.02191313475921795, |
|
"grad_norm": 5.297954082489014, |
|
"learning_rate": 7.858939801138061e-05, |
|
"loss": 1.9873, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.02204435113502165, |
|
"grad_norm": 3.1123671531677246, |
|
"learning_rate": 7.832031184624164e-05, |
|
"loss": 1.9712, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.02217556751082535, |
|
"grad_norm": 3.0200247764587402, |
|
"learning_rate": 7.80500125332005e-05, |
|
"loss": 1.7175, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.02230678388662905, |
|
"grad_norm": 4.045231819152832, |
|
"learning_rate": 7.777851165098012e-05, |
|
"loss": 1.6917, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.022438000262432752, |
|
"grad_norm": 3.3924217224121094, |
|
"learning_rate": 7.750582082977467e-05, |
|
"loss": 1.915, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.022569216638236453, |
|
"grad_norm": 3.338836669921875, |
|
"learning_rate": 7.723195175075136e-05, |
|
"loss": 1.7668, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.022700433014040153, |
|
"grad_norm": 3.4897708892822266, |
|
"learning_rate": 7.695691614555003e-05, |
|
"loss": 1.7118, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.022831649389843854, |
|
"grad_norm": 3.5899102687835693, |
|
"learning_rate": 7.668072579578058e-05, |
|
"loss": 1.6634, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.02296286576564755, |
|
"grad_norm": 3.5203874111175537, |
|
"learning_rate": 7.64033925325184e-05, |
|
"loss": 1.8026, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.023094082141451252, |
|
"grad_norm": 3.14801287651062, |
|
"learning_rate": 7.612492823579745e-05, |
|
"loss": 1.6852, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.023225298517254953, |
|
"grad_norm": 3.505002021789551, |
|
"learning_rate": 7.584534483410137e-05, |
|
"loss": 1.6794, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.023356514893058653, |
|
"grad_norm": 3.6999619007110596, |
|
"learning_rate": 7.55646543038526e-05, |
|
"loss": 1.6416, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.023487731268862354, |
|
"grad_norm": 3.438927412033081, |
|
"learning_rate": 7.528286866889924e-05, |
|
"loss": 1.6118, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.023618947644666054, |
|
"grad_norm": 3.4257144927978516, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.4316, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.023750164020469755, |
|
"grad_norm": 3.3115530014038086, |
|
"learning_rate": 7.471606041430723e-05, |
|
"loss": 1.4048, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.023881380396273456, |
|
"grad_norm": 3.7471344470977783, |
|
"learning_rate": 7.443106207484776e-05, |
|
"loss": 1.4955, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.024012596772077156, |
|
"grad_norm": 3.900952100753784, |
|
"learning_rate": 7.414501719000187e-05, |
|
"loss": 1.5621, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.024143813147880857, |
|
"grad_norm": 3.704540491104126, |
|
"learning_rate": 7.385793801298042e-05, |
|
"loss": 1.4024, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.024275029523684554, |
|
"grad_norm": 3.8118085861206055, |
|
"learning_rate": 7.35698368412999e-05, |
|
"loss": 1.4171, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.024406245899488255, |
|
"grad_norm": 4.9681196212768555, |
|
"learning_rate": 7.328072601625557e-05, |
|
"loss": 1.4693, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.024537462275291955, |
|
"grad_norm": 3.6511571407318115, |
|
"learning_rate": 7.2990617922393e-05, |
|
"loss": 1.2867, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.024668678651095656, |
|
"grad_norm": 3.6810758113861084, |
|
"learning_rate": 7.269952498697734e-05, |
|
"loss": 1.1199, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.024799895026899357, |
|
"grad_norm": 3.7081968784332275, |
|
"learning_rate": 7.240745967946113e-05, |
|
"loss": 1.0948, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.024931111402703057, |
|
"grad_norm": 3.9994025230407715, |
|
"learning_rate": 7.211443451095007e-05, |
|
"loss": 1.3183, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.025062327778506758, |
|
"grad_norm": 3.6044254302978516, |
|
"learning_rate": 7.18204620336671e-05, |
|
"loss": 1.2563, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.02519354415431046, |
|
"grad_norm": 3.95975923538208, |
|
"learning_rate": 7.152555484041476e-05, |
|
"loss": 0.9602, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.02532476053011416, |
|
"grad_norm": 3.7815332412719727, |
|
"learning_rate": 7.122972556403567e-05, |
|
"loss": 1.0867, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.02545597690591786, |
|
"grad_norm": 3.6170129776000977, |
|
"learning_rate": 7.09329868768714e-05, |
|
"loss": 0.8663, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.02558719328172156, |
|
"grad_norm": 3.3122055530548096, |
|
"learning_rate": 7.063535149021973e-05, |
|
"loss": 0.8313, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.025718409657525258, |
|
"grad_norm": 4.299932479858398, |
|
"learning_rate": 7.033683215379002e-05, |
|
"loss": 1.1031, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.025849626033328958, |
|
"grad_norm": 3.6584527492523193, |
|
"learning_rate": 7.003744165515705e-05, |
|
"loss": 0.6661, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.02598084240913266, |
|
"grad_norm": 3.595172166824341, |
|
"learning_rate": 6.973719281921335e-05, |
|
"loss": 0.6774, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.02611205878493636, |
|
"grad_norm": 3.2948033809661865, |
|
"learning_rate": 6.943609850761979e-05, |
|
"loss": 0.6726, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.02624327516074006, |
|
"grad_norm": 4.285879135131836, |
|
"learning_rate": 6.91341716182545e-05, |
|
"loss": 0.648, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02624327516074006, |
|
"eval_loss": 1.8295915126800537, |
|
"eval_runtime": 1356.6996, |
|
"eval_samples_per_second": 9.461, |
|
"eval_steps_per_second": 2.365, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02637449153654376, |
|
"grad_norm": 2.5337905883789062, |
|
"learning_rate": 6.883142508466054e-05, |
|
"loss": 1.9931, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.02650570791234746, |
|
"grad_norm": 1.4764660596847534, |
|
"learning_rate": 6.852787187549182e-05, |
|
"loss": 2.0433, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.026636924288151162, |
|
"grad_norm": 2.1547956466674805, |
|
"learning_rate": 6.82235249939575e-05, |
|
"loss": 1.9185, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.026768140663954863, |
|
"grad_norm": 2.4460480213165283, |
|
"learning_rate": 6.7918397477265e-05, |
|
"loss": 1.9992, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.026899357039758563, |
|
"grad_norm": 1.4299720525741577, |
|
"learning_rate": 6.761250239606169e-05, |
|
"loss": 2.0654, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.027030573415562264, |
|
"grad_norm": 1.9943609237670898, |
|
"learning_rate": 6.730585285387465e-05, |
|
"loss": 2.0979, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.02716178979136596, |
|
"grad_norm": 2.2427730560302734, |
|
"learning_rate": 6.699846198654971e-05, |
|
"loss": 2.2645, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.02729300616716966, |
|
"grad_norm": 2.437592029571533, |
|
"learning_rate": 6.669034296168855e-05, |
|
"loss": 2.2168, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.027424222542973362, |
|
"grad_norm": 2.5972635746002197, |
|
"learning_rate": 6.638150897808468e-05, |
|
"loss": 2.2821, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.027555438918777063, |
|
"grad_norm": 2.6581530570983887, |
|
"learning_rate": 6.607197326515808e-05, |
|
"loss": 2.1277, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.027686655294580764, |
|
"grad_norm": 3.457130193710327, |
|
"learning_rate": 6.57617490823885e-05, |
|
"loss": 2.0846, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.027817871670384464, |
|
"grad_norm": 2.862334728240967, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 2.0455, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.027949088046188165, |
|
"grad_norm": 2.7949256896972656, |
|
"learning_rate": 6.513928849212873e-05, |
|
"loss": 1.8195, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.028080304421991865, |
|
"grad_norm": 3.603222608566284, |
|
"learning_rate": 6.482707874877854e-05, |
|
"loss": 2.1141, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.028211520797795566, |
|
"grad_norm": 3.7938568592071533, |
|
"learning_rate": 6.451423386272312e-05, |
|
"loss": 2.0341, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.028342737173599267, |
|
"grad_norm": 3.7415919303894043, |
|
"learning_rate": 6.420076723519614e-05, |
|
"loss": 1.9174, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.028473953549402964, |
|
"grad_norm": 3.2029411792755127, |
|
"learning_rate": 6.388669229406462e-05, |
|
"loss": 1.9153, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.028605169925206664, |
|
"grad_norm": 4.757354259490967, |
|
"learning_rate": 6.357202249325371e-05, |
|
"loss": 2.0522, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.028736386301010365, |
|
"grad_norm": 4.18367862701416, |
|
"learning_rate": 6.32567713121704e-05, |
|
"loss": 1.7395, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.028867602676814066, |
|
"grad_norm": 3.411064863204956, |
|
"learning_rate": 6.294095225512603e-05, |
|
"loss": 1.8052, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.028998819052617766, |
|
"grad_norm": 3.278550624847412, |
|
"learning_rate": 6.26245788507579e-05, |
|
"loss": 1.9089, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.029130035428421467, |
|
"grad_norm": 3.312464714050293, |
|
"learning_rate": 6.230766465144967e-05, |
|
"loss": 1.7548, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.029261251804225168, |
|
"grad_norm": 3.0991432666778564, |
|
"learning_rate": 6.199022323275083e-05, |
|
"loss": 1.6692, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.02939246818002887, |
|
"grad_norm": 3.4286837577819824, |
|
"learning_rate": 6.167226819279528e-05, |
|
"loss": 1.79, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.02952368455583257, |
|
"grad_norm": 3.0509002208709717, |
|
"learning_rate": 6.135381315171867e-05, |
|
"loss": 1.5622, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.02965490093163627, |
|
"grad_norm": 3.1480135917663574, |
|
"learning_rate": 6.103487175107507e-05, |
|
"loss": 1.622, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.02978611730743997, |
|
"grad_norm": 3.6017837524414062, |
|
"learning_rate": 6.071545765325254e-05, |
|
"loss": 1.6357, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.029917333683243667, |
|
"grad_norm": 3.5763702392578125, |
|
"learning_rate": 6.0395584540887963e-05, |
|
"loss": 1.647, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.030048550059047368, |
|
"grad_norm": 3.3560070991516113, |
|
"learning_rate": 6.007526611628086e-05, |
|
"loss": 1.4219, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.03017976643485107, |
|
"grad_norm": 3.4218170642852783, |
|
"learning_rate": 5.9754516100806423e-05, |
|
"loss": 1.5417, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.03031098281065477, |
|
"grad_norm": 3.3482179641723633, |
|
"learning_rate": 5.9433348234327765e-05, |
|
"loss": 1.4549, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.03044219918645847, |
|
"grad_norm": 3.298544406890869, |
|
"learning_rate": 5.911177627460739e-05, |
|
"loss": 1.4727, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.03057341556226217, |
|
"grad_norm": 3.360682487487793, |
|
"learning_rate": 5.8789813996717736e-05, |
|
"loss": 1.4171, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.03070463193806587, |
|
"grad_norm": 3.3437387943267822, |
|
"learning_rate": 5.8467475192451226e-05, |
|
"loss": 1.2873, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.03083584831386957, |
|
"grad_norm": 3.6862969398498535, |
|
"learning_rate": 5.814477366972945e-05, |
|
"loss": 1.5642, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.030967064689673272, |
|
"grad_norm": 3.4449985027313232, |
|
"learning_rate": 5.782172325201155e-05, |
|
"loss": 1.2568, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.031098281065476973, |
|
"grad_norm": 4.007747173309326, |
|
"learning_rate": 5.749833777770225e-05, |
|
"loss": 1.1918, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.03122949744128067, |
|
"grad_norm": 3.507702589035034, |
|
"learning_rate": 5.717463109955896e-05, |
|
"loss": 1.2402, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.03136071381708437, |
|
"grad_norm": 3.6946218013763428, |
|
"learning_rate": 5.685061708409841e-05, |
|
"loss": 1.4288, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.031491930192888075, |
|
"grad_norm": 4.057668209075928, |
|
"learning_rate": 5.6526309611002594e-05, |
|
"loss": 1.1473, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.03162314656869177, |
|
"grad_norm": 3.8214073181152344, |
|
"learning_rate": 5.6201722572524275e-05, |
|
"loss": 1.1556, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.031754362944495476, |
|
"grad_norm": 3.187711000442505, |
|
"learning_rate": 5.587686987289189e-05, |
|
"loss": 0.8827, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.03188557932029917, |
|
"grad_norm": 7.194174289703369, |
|
"learning_rate": 5.5551765427713884e-05, |
|
"loss": 1.1186, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.03201679569610287, |
|
"grad_norm": 3.3873300552368164, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 0.9614, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.032148012071906575, |
|
"grad_norm": 3.6875414848327637, |
|
"learning_rate": 5.490085701647805e-05, |
|
"loss": 0.9514, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.03227922844771027, |
|
"grad_norm": 2.922820806503296, |
|
"learning_rate": 5.457508093317013e-05, |
|
"loss": 0.6782, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.032410444823513976, |
|
"grad_norm": 2.960690975189209, |
|
"learning_rate": 5.4249108868622086e-05, |
|
"loss": 0.5822, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.03254166119931767, |
|
"grad_norm": 2.816145658493042, |
|
"learning_rate": 5.392295478639225e-05, |
|
"loss": 0.5743, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.03267287757512138, |
|
"grad_norm": 3.569626569747925, |
|
"learning_rate": 5.359663265783598e-05, |
|
"loss": 0.6907, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.032804093950925074, |
|
"grad_norm": 4.098865032196045, |
|
"learning_rate": 5.327015646150716e-05, |
|
"loss": 0.7811, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.03293531032672878, |
|
"grad_norm": 1.4596612453460693, |
|
"learning_rate": 5.294354018255945e-05, |
|
"loss": 1.9946, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.033066526702532476, |
|
"grad_norm": 1.3144938945770264, |
|
"learning_rate": 5.26167978121472e-05, |
|
"loss": 1.9186, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.03319774307833618, |
|
"grad_norm": 1.3099392652511597, |
|
"learning_rate": 5.228994334682604e-05, |
|
"loss": 1.9639, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.03332895945413988, |
|
"grad_norm": 1.3149683475494385, |
|
"learning_rate": 5.196299078795344e-05, |
|
"loss": 1.9173, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.033460175829943574, |
|
"grad_norm": 1.3609029054641724, |
|
"learning_rate": 5.1635954141088813e-05, |
|
"loss": 2.0152, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.03359139220574728, |
|
"grad_norm": 1.6170399188995361, |
|
"learning_rate": 5.1308847415393666e-05, |
|
"loss": 2.0613, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.033722608581550975, |
|
"grad_norm": 2.1593005657196045, |
|
"learning_rate": 5.0981684623031415e-05, |
|
"loss": 2.0864, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.03385382495735468, |
|
"grad_norm": 2.1808066368103027, |
|
"learning_rate": 5.0654479778567223e-05, |
|
"loss": 2.0672, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.033985041333158376, |
|
"grad_norm": 2.3150367736816406, |
|
"learning_rate": 5.0327246898367597e-05, |
|
"loss": 2.0188, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.03411625770896208, |
|
"grad_norm": 2.590977430343628, |
|
"learning_rate": 5e-05, |
|
"loss": 2.1361, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.03424747408476578, |
|
"grad_norm": 2.5015792846679688, |
|
"learning_rate": 4.9672753101632415e-05, |
|
"loss": 1.9808, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.03437869046056948, |
|
"grad_norm": 3.2308530807495117, |
|
"learning_rate": 4.934552022143279e-05, |
|
"loss": 2.0869, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.03450990683637318, |
|
"grad_norm": 3.2150726318359375, |
|
"learning_rate": 4.901831537696859e-05, |
|
"loss": 1.9293, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.03464112321217688, |
|
"grad_norm": 2.861090898513794, |
|
"learning_rate": 4.869115258460635e-05, |
|
"loss": 2.0099, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.03477233958798058, |
|
"grad_norm": 2.8258917331695557, |
|
"learning_rate": 4.83640458589112e-05, |
|
"loss": 1.8759, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.03490355596378428, |
|
"grad_norm": 2.8827760219573975, |
|
"learning_rate": 4.8037009212046586e-05, |
|
"loss": 1.8166, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.03503477233958798, |
|
"grad_norm": 2.917901039123535, |
|
"learning_rate": 4.7710056653173976e-05, |
|
"loss": 1.8738, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.03516598871539168, |
|
"grad_norm": 3.3789291381835938, |
|
"learning_rate": 4.738320218785281e-05, |
|
"loss": 1.8185, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.03529720509119538, |
|
"grad_norm": 3.0679268836975098, |
|
"learning_rate": 4.7056459817440544e-05, |
|
"loss": 1.7584, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.03542842146699908, |
|
"grad_norm": 3.146171808242798, |
|
"learning_rate": 4.6729843538492847e-05, |
|
"loss": 1.7729, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.035559637842802784, |
|
"grad_norm": 3.02197265625, |
|
"learning_rate": 4.640336734216403e-05, |
|
"loss": 1.7293, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.03569085421860648, |
|
"grad_norm": 3.030005931854248, |
|
"learning_rate": 4.607704521360776e-05, |
|
"loss": 1.7331, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.035822070594410185, |
|
"grad_norm": 2.7871899604797363, |
|
"learning_rate": 4.575089113137792e-05, |
|
"loss": 1.5662, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.03595328697021388, |
|
"grad_norm": 3.5039308071136475, |
|
"learning_rate": 4.542491906682989e-05, |
|
"loss": 1.7353, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.03608450334601758, |
|
"grad_norm": 3.1538591384887695, |
|
"learning_rate": 4.509914298352197e-05, |
|
"loss": 1.5676, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.036215719721821284, |
|
"grad_norm": 2.9758079051971436, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 1.6357, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.03634693609762498, |
|
"grad_norm": 3.1648526191711426, |
|
"learning_rate": 4.444823457228612e-05, |
|
"loss": 1.5309, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.036478152473428685, |
|
"grad_norm": 3.3066608905792236, |
|
"learning_rate": 4.412313012710813e-05, |
|
"loss": 1.572, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.03660936884923238, |
|
"grad_norm": 3.489302635192871, |
|
"learning_rate": 4.379827742747575e-05, |
|
"loss": 1.6067, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.036740585225036086, |
|
"grad_norm": 3.4767446517944336, |
|
"learning_rate": 4.347369038899744e-05, |
|
"loss": 1.6319, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.03687180160083978, |
|
"grad_norm": 3.145461320877075, |
|
"learning_rate": 4.3149382915901606e-05, |
|
"loss": 1.4674, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.03700301797664349, |
|
"grad_norm": 3.5818727016448975, |
|
"learning_rate": 4.282536890044104e-05, |
|
"loss": 1.566, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.037134234352447185, |
|
"grad_norm": 3.861572027206421, |
|
"learning_rate": 4.250166222229774e-05, |
|
"loss": 1.5454, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.03726545072825089, |
|
"grad_norm": 3.507399320602417, |
|
"learning_rate": 4.2178276747988446e-05, |
|
"loss": 1.3924, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.037396667104054586, |
|
"grad_norm": 3.6404268741607666, |
|
"learning_rate": 4.185522633027057e-05, |
|
"loss": 1.4609, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.03752788347985828, |
|
"grad_norm": 3.455463171005249, |
|
"learning_rate": 4.153252480754877e-05, |
|
"loss": 1.293, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.03765909985566199, |
|
"grad_norm": 3.82511305809021, |
|
"learning_rate": 4.1210186003282275e-05, |
|
"loss": 1.3777, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.037790316231465684, |
|
"grad_norm": 4.107599258422852, |
|
"learning_rate": 4.088822372539263e-05, |
|
"loss": 1.4122, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.03792153260726939, |
|
"grad_norm": 3.8363401889801025, |
|
"learning_rate": 4.0566651765672246e-05, |
|
"loss": 1.1753, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.038052748983073086, |
|
"grad_norm": 3.759277105331421, |
|
"learning_rate": 4.0245483899193595e-05, |
|
"loss": 1.2129, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.03818396535887679, |
|
"grad_norm": 3.4467883110046387, |
|
"learning_rate": 3.992473388371915e-05, |
|
"loss": 1.1519, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.03831518173468049, |
|
"grad_norm": 3.933537483215332, |
|
"learning_rate": 3.960441545911204e-05, |
|
"loss": 1.148, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.03844639811048419, |
|
"grad_norm": 3.5908944606781006, |
|
"learning_rate": 3.928454234674747e-05, |
|
"loss": 1.0006, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.03857761448628789, |
|
"grad_norm": 3.9158682823181152, |
|
"learning_rate": 3.896512824892495e-05, |
|
"loss": 1.1491, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.03870883086209159, |
|
"grad_norm": 3.2645325660705566, |
|
"learning_rate": 3.864618684828134e-05, |
|
"loss": 0.6771, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.03884004723789529, |
|
"grad_norm": 3.201977252960205, |
|
"learning_rate": 3.832773180720475e-05, |
|
"loss": 0.6873, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.038971263613698987, |
|
"grad_norm": 3.315368175506592, |
|
"learning_rate": 3.800977676724919e-05, |
|
"loss": 0.6829, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.03910247998950269, |
|
"grad_norm": 3.72304368019104, |
|
"learning_rate": 3.769233534855035e-05, |
|
"loss": 0.8417, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.03923369636530639, |
|
"grad_norm": 3.146021842956543, |
|
"learning_rate": 3.73754211492421e-05, |
|
"loss": 0.5809, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.03936491274111009, |
|
"grad_norm": 3.441509485244751, |
|
"learning_rate": 3.705904774487396e-05, |
|
"loss": 0.5907, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03936491274111009, |
|
"eval_loss": 1.6083000898361206, |
|
"eval_runtime": 1364.8957, |
|
"eval_samples_per_second": 9.404, |
|
"eval_steps_per_second": 2.351, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03949612911691379, |
|
"grad_norm": 1.1431409120559692, |
|
"learning_rate": 3.6743228687829595e-05, |
|
"loss": 1.9111, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.03962734549271749, |
|
"grad_norm": 1.1873606443405151, |
|
"learning_rate": 3.642797750674629e-05, |
|
"loss": 1.8401, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.03975856186852119, |
|
"grad_norm": 1.1036310195922852, |
|
"learning_rate": 3.6113307705935396e-05, |
|
"loss": 1.9566, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.039889778244324894, |
|
"grad_norm": 1.2092435359954834, |
|
"learning_rate": 3.579923276480387e-05, |
|
"loss": 1.9171, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.04002099462012859, |
|
"grad_norm": 1.284285545349121, |
|
"learning_rate": 3.5485766137276894e-05, |
|
"loss": 1.8699, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.040152210995932296, |
|
"grad_norm": 1.4310280084609985, |
|
"learning_rate": 3.5172921251221455e-05, |
|
"loss": 1.9898, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.04028342737173599, |
|
"grad_norm": 1.8922241926193237, |
|
"learning_rate": 3.486071150787128e-05, |
|
"loss": 2.0229, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.04041464374753969, |
|
"grad_norm": 2.3455190658569336, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 2.1236, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.040545860123343394, |
|
"grad_norm": 2.310145616531372, |
|
"learning_rate": 3.423825091761153e-05, |
|
"loss": 2.0076, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.04067707649914709, |
|
"grad_norm": 2.770493507385254, |
|
"learning_rate": 3.392802673484193e-05, |
|
"loss": 2.1042, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.040808292874950795, |
|
"grad_norm": 2.8514256477355957, |
|
"learning_rate": 3.361849102191533e-05, |
|
"loss": 2.1725, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.04093950925075449, |
|
"grad_norm": 2.9664652347564697, |
|
"learning_rate": 3.330965703831146e-05, |
|
"loss": 1.9951, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.0410707256265582, |
|
"grad_norm": 3.0636472702026367, |
|
"learning_rate": 3.300153801345028e-05, |
|
"loss": 2.0478, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.041201942002361894, |
|
"grad_norm": 3.0987637042999268, |
|
"learning_rate": 3.2694147146125345e-05, |
|
"loss": 1.8762, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.0413331583781656, |
|
"grad_norm": 3.3828492164611816, |
|
"learning_rate": 3.2387497603938326e-05, |
|
"loss": 2.0024, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.041464374753969295, |
|
"grad_norm": 3.3837296962738037, |
|
"learning_rate": 3.2081602522734986e-05, |
|
"loss": 1.8804, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.041595591129773, |
|
"grad_norm": 3.1520280838012695, |
|
"learning_rate": 3.177647500604252e-05, |
|
"loss": 1.7544, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.041726807505576696, |
|
"grad_norm": 3.064986228942871, |
|
"learning_rate": 3.147212812450819e-05, |
|
"loss": 1.7702, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.04185802388138039, |
|
"grad_norm": 2.9096457958221436, |
|
"learning_rate": 3.116857491533947e-05, |
|
"loss": 1.7361, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.0419892402571841, |
|
"grad_norm": 3.204127073287964, |
|
"learning_rate": 3.086582838174551e-05, |
|
"loss": 2.0178, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.042120456632987795, |
|
"grad_norm": 3.014491081237793, |
|
"learning_rate": 3.056390149238022e-05, |
|
"loss": 1.6361, |
|
"step": 321 |
|
}, |
|
    {
      "epoch": 0.0422516730087915,
      "grad_norm": 2.9719576835632324,
      "learning_rate": 3.0262807180786647e-05,
      "loss": 1.5053,
      "step": 322
    },
    {
      "epoch": 0.042382889384595196,
      "grad_norm": 2.9963722229003906,
      "learning_rate": 2.996255834484296e-05,
      "loss": 1.6507,
      "step": 323
    },
    {
      "epoch": 0.0425141057603989,
      "grad_norm": 3.126011848449707,
      "learning_rate": 2.9663167846209998e-05,
      "loss": 1.6596,
      "step": 324
    },
    {
      "epoch": 0.0426453221362026,
      "grad_norm": 3.2865161895751953,
      "learning_rate": 2.936464850978027e-05,
      "loss": 1.742,
      "step": 325
    },
    {
      "epoch": 0.0427765385120063,
      "grad_norm": 3.1985416412353516,
      "learning_rate": 2.9067013123128613e-05,
      "loss": 1.592,
      "step": 326
    },
    {
      "epoch": 0.04290775488781,
      "grad_norm": 3.0526158809661865,
      "learning_rate": 2.8770274435964355e-05,
      "loss": 1.6197,
      "step": 327
    },
    {
      "epoch": 0.043038971263613696,
      "grad_norm": 3.0706164836883545,
      "learning_rate": 2.8474445159585235e-05,
      "loss": 1.6091,
      "step": 328
    },
    {
      "epoch": 0.0431701876394174,
      "grad_norm": 3.1661648750305176,
      "learning_rate": 2.8179537966332887e-05,
      "loss": 1.4687,
      "step": 329
    },
    {
      "epoch": 0.0433014040152211,
      "grad_norm": 3.272674798965454,
      "learning_rate": 2.7885565489049946e-05,
      "loss": 1.5756,
      "step": 330
    },
    {
      "epoch": 0.0434326203910248,
      "grad_norm": 3.0554146766662598,
      "learning_rate": 2.759254032053888e-05,
      "loss": 1.3903,
      "step": 331
    },
    {
      "epoch": 0.0435638367668285,
      "grad_norm": 3.462747097015381,
      "learning_rate": 2.7300475013022663e-05,
      "loss": 1.433,
      "step": 332
    },
    {
      "epoch": 0.0436950531426322,
      "grad_norm": 3.4860434532165527,
      "learning_rate": 2.700938207760701e-05,
      "loss": 1.4024,
      "step": 333
    },
    {
      "epoch": 0.0438262695184359,
      "grad_norm": 3.317476987838745,
      "learning_rate": 2.671927398374443e-05,
      "loss": 1.3322,
      "step": 334
    },
    {
      "epoch": 0.043957485894239604,
      "grad_norm": 3.5378825664520264,
      "learning_rate": 2.6430163158700115e-05,
      "loss": 1.3707,
      "step": 335
    },
    {
      "epoch": 0.0440887022700433,
      "grad_norm": 3.7148430347442627,
      "learning_rate": 2.6142061987019577e-05,
      "loss": 1.4425,
      "step": 336
    },
    {
      "epoch": 0.044219918645847005,
      "grad_norm": 3.060731887817383,
      "learning_rate": 2.5854982809998153e-05,
      "loss": 1.1957,
      "step": 337
    },
    {
      "epoch": 0.0443511350216507,
      "grad_norm": 3.4139750003814697,
      "learning_rate": 2.556893792515227e-05,
      "loss": 1.3196,
      "step": 338
    },
    {
      "epoch": 0.0444823513974544,
      "grad_norm": 3.1842236518859863,
      "learning_rate": 2.5283939585692783e-05,
      "loss": 1.182,
      "step": 339
    },
    {
      "epoch": 0.0446135677732581,
      "grad_norm": 3.5313189029693604,
      "learning_rate": 2.500000000000001e-05,
      "loss": 1.1865,
      "step": 340
    },
    {
      "epoch": 0.0447447841490618,
      "grad_norm": 3.486128091812134,
      "learning_rate": 2.471713133110078e-05,
      "loss": 1.1062,
      "step": 341
    },
    {
      "epoch": 0.044876000524865504,
      "grad_norm": 3.0605080127716064,
      "learning_rate": 2.4435345696147403e-05,
      "loss": 0.8957,
      "step": 342
    },
    {
      "epoch": 0.0450072169006692,
      "grad_norm": 3.2468960285186768,
      "learning_rate": 2.4154655165898627e-05,
      "loss": 0.9158,
      "step": 343
    },
    {
      "epoch": 0.045138433276472906,
      "grad_norm": 3.440025568008423,
      "learning_rate": 2.3875071764202563e-05,
      "loss": 0.8078,
      "step": 344
    },
    {
      "epoch": 0.0452696496522766,
      "grad_norm": 2.9829273223876953,
      "learning_rate": 2.3596607467481603e-05,
      "loss": 0.7363,
      "step": 345
    },
    {
      "epoch": 0.04540086602808031,
      "grad_norm": 3.14467453956604,
      "learning_rate": 2.3319274204219428e-05,
      "loss": 0.5925,
      "step": 346
    },
    {
      "epoch": 0.045532082403884004,
      "grad_norm": 3.064061403274536,
      "learning_rate": 2.3043083854449988e-05,
      "loss": 0.6579,
      "step": 347
    },
    {
      "epoch": 0.04566329877968771,
      "grad_norm": 2.8710074424743652,
      "learning_rate": 2.2768048249248648e-05,
      "loss": 0.5748,
      "step": 348
    },
    {
      "epoch": 0.045794515155491405,
      "grad_norm": 3.819432020187378,
      "learning_rate": 2.2494179170225333e-05,
      "loss": 0.6714,
      "step": 349
    },
    {
      "epoch": 0.0459257315312951,
      "grad_norm": 4.986090183258057,
      "learning_rate": 2.2221488349019903e-05,
      "loss": 0.7477,
      "step": 350
    },
    {
      "epoch": 0.04605694790709881,
      "grad_norm": 2.6902003288269043,
      "learning_rate": 2.194998746679952e-05,
      "loss": 1.8673,
      "step": 351
    },
    {
      "epoch": 0.046188164282902504,
      "grad_norm": 2.166841506958008,
      "learning_rate": 2.167968815375837e-05,
      "loss": 1.8827,
      "step": 352
    },
    {
      "epoch": 0.04631938065870621,
      "grad_norm": 1.7183693647384644,
      "learning_rate": 2.1410601988619394e-05,
      "loss": 1.8546,
      "step": 353
    },
    {
      "epoch": 0.046450597034509905,
      "grad_norm": 1.265217900276184,
      "learning_rate": 2.1142740498138324e-05,
      "loss": 1.9024,
      "step": 354
    },
    {
      "epoch": 0.04658181341031361,
      "grad_norm": 1.721006155014038,
      "learning_rate": 2.08761151566099e-05,
      "loss": 1.955,
      "step": 355
    },
    {
      "epoch": 0.046713029786117306,
      "grad_norm": 2.1906604766845703,
      "learning_rate": 2.061073738537635e-05,
      "loss": 2.01,
      "step": 356
    },
    {
      "epoch": 0.04684424616192101,
      "grad_norm": 2.1465506553649902,
      "learning_rate": 2.034661855233815e-05,
      "loss": 2.0038,
      "step": 357
    },
    {
      "epoch": 0.04697546253772471,
      "grad_norm": 2.263490915298462,
      "learning_rate": 2.008376997146705e-05,
      "loss": 2.068,
      "step": 358
    },
    {
      "epoch": 0.04710667891352841,
      "grad_norm": 2.7283573150634766,
      "learning_rate": 1.982220290232143e-05,
      "loss": 2.0493,
      "step": 359
    },
    {
      "epoch": 0.04723789528933211,
      "grad_norm": 2.6310789585113525,
      "learning_rate": 1.9561928549563968e-05,
      "loss": 2.069,
      "step": 360
    },
    {
      "epoch": 0.047369111665135806,
      "grad_norm": 2.767486333847046,
      "learning_rate": 1.9302958062481673e-05,
      "loss": 2.028,
      "step": 361
    },
    {
      "epoch": 0.04750032804093951,
      "grad_norm": 2.8701977729797363,
      "learning_rate": 1.9045302534508297e-05,
      "loss": 1.9971,
      "step": 362
    },
    {
      "epoch": 0.04763154441674321,
      "grad_norm": 3.023191213607788,
      "learning_rate": 1.8788973002749112e-05,
      "loss": 1.9896,
      "step": 363
    },
    {
      "epoch": 0.04776276079254691,
      "grad_norm": 3.0075771808624268,
      "learning_rate": 1.8533980447508137e-05,
      "loss": 1.9956,
      "step": 364
    },
    {
      "epoch": 0.04789397716835061,
      "grad_norm": 3.155802011489868,
      "learning_rate": 1.8280335791817733e-05,
      "loss": 1.7729,
      "step": 365
    },
    {
      "epoch": 0.04802519354415431,
      "grad_norm": 3.0290050506591797,
      "learning_rate": 1.8028049900970767e-05,
      "loss": 1.7798,
      "step": 366
    },
    {
      "epoch": 0.04815640991995801,
      "grad_norm": 2.8980050086975098,
      "learning_rate": 1.777713358205514e-05,
      "loss": 1.7175,
      "step": 367
    },
    {
      "epoch": 0.048287626295761714,
      "grad_norm": 3.2710909843444824,
      "learning_rate": 1.7527597583490822e-05,
      "loss": 1.829,
      "step": 368
    },
    {
      "epoch": 0.04841884267156541,
      "grad_norm": 3.313262462615967,
      "learning_rate": 1.7279452594569483e-05,
      "loss": 1.8382,
      "step": 369
    },
    {
      "epoch": 0.04855005904736911,
      "grad_norm": 3.1630051136016846,
      "learning_rate": 1.703270924499656e-05,
      "loss": 1.7057,
      "step": 370
    },
    {
      "epoch": 0.04868127542317281,
      "grad_norm": 3.111182928085327,
      "learning_rate": 1.678737810443593e-05,
      "loss": 1.6428,
      "step": 371
    },
    {
      "epoch": 0.04881249179897651,
      "grad_norm": 3.932502508163452,
      "learning_rate": 1.6543469682057106e-05,
      "loss": 1.7103,
      "step": 372
    },
    {
      "epoch": 0.048943708174780214,
      "grad_norm": 3.1841988563537598,
      "learning_rate": 1.6300994426085103e-05,
      "loss": 1.5631,
      "step": 373
    },
    {
      "epoch": 0.04907492455058391,
      "grad_norm": 3.1388206481933594,
      "learning_rate": 1.605996272335291e-05,
      "loss": 1.6667,
      "step": 374
    },
    {
      "epoch": 0.049206140926387615,
      "grad_norm": 3.2520394325256348,
      "learning_rate": 1.5820384898856434e-05,
      "loss": 1.5319,
      "step": 375
    },
    {
      "epoch": 0.04933735730219131,
      "grad_norm": 3.448615074157715,
      "learning_rate": 1.5582271215312294e-05,
      "loss": 1.3435,
      "step": 376
    },
    {
      "epoch": 0.049468573677995016,
      "grad_norm": 3.1773369312286377,
      "learning_rate": 1.5345631872718214e-05,
      "loss": 1.4977,
      "step": 377
    },
    {
      "epoch": 0.04959979005379871,
      "grad_norm": 3.773745536804199,
      "learning_rate": 1.5110477007916001e-05,
      "loss": 1.6206,
      "step": 378
    },
    {
      "epoch": 0.04973100642960242,
      "grad_norm": 3.2866322994232178,
      "learning_rate": 1.4876816694157419e-05,
      "loss": 1.4102,
      "step": 379
    },
    {
      "epoch": 0.049862222805406115,
      "grad_norm": 3.218993902206421,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 1.4885,
      "step": 380
    },
    {
      "epoch": 0.04999343918120981,
      "grad_norm": 3.413767099380493,
      "learning_rate": 1.4414019692241437e-05,
      "loss": 1.4064,
      "step": 381
    },
    {
      "epoch": 0.050124655557013516,
      "grad_norm": 3.1845431327819824,
      "learning_rate": 1.4184902828767287e-05,
      "loss": 1.3509,
      "step": 382
    },
    {
      "epoch": 0.05025587193281721,
      "grad_norm": 3.4849841594696045,
      "learning_rate": 1.3957320164854059e-05,
      "loss": 1.4778,
      "step": 383
    },
    {
      "epoch": 0.05038708830862092,
      "grad_norm": 3.433154344558716,
      "learning_rate": 1.373128144938563e-05,
      "loss": 1.2556,
      "step": 384
    },
    {
      "epoch": 0.050518304684424614,
      "grad_norm": 3.4509999752044678,
      "learning_rate": 1.3506796365108232e-05,
      "loss": 1.3327,
      "step": 385
    },
    {
      "epoch": 0.05064952106022832,
      "grad_norm": 3.3174984455108643,
      "learning_rate": 1.3283874528215733e-05,
      "loss": 1.2881,
      "step": 386
    },
    {
      "epoch": 0.050780737436032015,
      "grad_norm": 3.912639856338501,
      "learning_rate": 1.3062525487937699e-05,
      "loss": 1.2161,
      "step": 387
    },
    {
      "epoch": 0.05091195381183572,
      "grad_norm": 3.7738358974456787,
      "learning_rate": 1.2842758726130283e-05,
      "loss": 1.4717,
      "step": 388
    },
    {
      "epoch": 0.05104317018763942,
      "grad_norm": 3.3311028480529785,
      "learning_rate": 1.2624583656870154e-05,
      "loss": 0.9475,
      "step": 389
    },
    {
      "epoch": 0.05117438656344312,
      "grad_norm": 3.2263801097869873,
      "learning_rate": 1.2408009626051137e-05,
      "loss": 1.0097,
      "step": 390
    },
    {
      "epoch": 0.05130560293924682,
      "grad_norm": 3.5462255477905273,
      "learning_rate": 1.2193045910983863e-05,
      "loss": 1.0301,
      "step": 391
    },
    {
      "epoch": 0.051436819315050515,
      "grad_norm": 3.075239419937134,
      "learning_rate": 1.1979701719998453e-05,
      "loss": 0.8822,
      "step": 392
    },
    {
      "epoch": 0.05156803569085422,
      "grad_norm": 3.8291819095611572,
      "learning_rate": 1.1767986192049984e-05,
      "loss": 1.0269,
      "step": 393
    },
    {
      "epoch": 0.051699252066657916,
      "grad_norm": 3.1441256999969482,
      "learning_rate": 1.1557908396327028e-05,
      "loss": 0.8817,
      "step": 394
    },
    {
      "epoch": 0.05183046844246162,
      "grad_norm": 3.654750347137451,
      "learning_rate": 1.134947733186315e-05,
      "loss": 0.8977,
      "step": 395
    },
    {
      "epoch": 0.05196168481826532,
      "grad_norm": 3.509984254837036,
      "learning_rate": 1.1142701927151456e-05,
      "loss": 0.7434,
      "step": 396
    },
    {
      "epoch": 0.05209290119406902,
      "grad_norm": 3.4480764865875244,
      "learning_rate": 1.0937591039762085e-05,
      "loss": 0.6737,
      "step": 397
    },
    {
      "epoch": 0.05222411756987272,
      "grad_norm": 3.189035415649414,
      "learning_rate": 1.0734153455962765e-05,
      "loss": 0.5615,
      "step": 398
    },
    {
      "epoch": 0.05235533394567642,
      "grad_norm": 2.6568944454193115,
      "learning_rate": 1.0532397890342505e-05,
      "loss": 0.4878,
      "step": 399
    },
    {
      "epoch": 0.05248655032148012,
      "grad_norm": 4.114481449127197,
      "learning_rate": 1.0332332985438248e-05,
      "loss": 0.6082,
      "step": 400
    },
    {
      "epoch": 0.05248655032148012,
      "eval_loss": 1.4705314636230469,
      "eval_runtime": 1366.8691,
      "eval_samples_per_second": 9.391,
      "eval_steps_per_second": 2.348,
      "step": 400
    },
    {
      "epoch": 0.052617766697283824,
      "grad_norm": 1.5547120571136475,
      "learning_rate": 1.013396731136465e-05,
      "loss": 1.8584,
      "step": 401
    },
    {
      "epoch": 0.05274898307308752,
      "grad_norm": 1.4148300886154175,
      "learning_rate": 9.937309365446973e-06,
      "loss": 1.8086,
      "step": 402
    },
    {
      "epoch": 0.05288019944889122,
      "grad_norm": 1.1794555187225342,
      "learning_rate": 9.742367571857091e-06,
      "loss": 1.7795,
      "step": 403
    },
    {
      "epoch": 0.05301141582469492,
      "grad_norm": 1.2620103359222412,
      "learning_rate": 9.549150281252633e-06,
      "loss": 1.8577,
      "step": 404
    },
    {
      "epoch": 0.05314263220049862,
      "grad_norm": 1.2999380826950073,
      "learning_rate": 9.357665770419244e-06,
      "loss": 1.9158,
      "step": 405
    },
    {
      "epoch": 0.053273848576302324,
      "grad_norm": 1.7266931533813477,
      "learning_rate": 9.167922241916055e-06,
      "loss": 1.9962,
      "step": 406
    },
    {
      "epoch": 0.05340506495210602,
      "grad_norm": 1.747637152671814,
      "learning_rate": 8.97992782372432e-06,
      "loss": 1.9808,
      "step": 407
    },
    {
      "epoch": 0.053536281327909725,
      "grad_norm": 2.2035903930664062,
      "learning_rate": 8.793690568899216e-06,
      "loss": 1.975,
      "step": 408
    },
    {
      "epoch": 0.05366749770371342,
      "grad_norm": 2.1186153888702393,
      "learning_rate": 8.609218455224893e-06,
      "loss": 1.9926,
      "step": 409
    },
    {
      "epoch": 0.053798714079517126,
      "grad_norm": 2.213442087173462,
      "learning_rate": 8.426519384872733e-06,
      "loss": 1.9039,
      "step": 410
    },
    {
      "epoch": 0.053929930455320824,
      "grad_norm": 2.3685803413391113,
      "learning_rate": 8.245601184062852e-06,
      "loss": 1.958,
      "step": 411
    },
    {
      "epoch": 0.05406114683112453,
      "grad_norm": 2.664763927459717,
      "learning_rate": 8.066471602728803e-06,
      "loss": 1.9546,
      "step": 412
    },
    {
      "epoch": 0.054192363206928225,
      "grad_norm": 2.719207286834717,
      "learning_rate": 7.889138314185678e-06,
      "loss": 1.9461,
      "step": 413
    },
    {
      "epoch": 0.05432357958273192,
      "grad_norm": 2.6173901557922363,
      "learning_rate": 7.71360891480134e-06,
      "loss": 1.8008,
      "step": 414
    },
    {
      "epoch": 0.054454795958535626,
      "grad_norm": 3.068253755569458,
      "learning_rate": 7.539890923671062e-06,
      "loss": 1.9327,
      "step": 415
    },
    {
      "epoch": 0.05458601233433932,
      "grad_norm": 2.894899606704712,
      "learning_rate": 7.367991782295391e-06,
      "loss": 1.9416,
      "step": 416
    },
    {
      "epoch": 0.05471722871014303,
      "grad_norm": 3.028637170791626,
      "learning_rate": 7.197918854261432e-06,
      "loss": 1.7818,
      "step": 417
    },
    {
      "epoch": 0.054848445085946725,
      "grad_norm": 2.8751864433288574,
      "learning_rate": 7.029679424927365e-06,
      "loss": 1.8194,
      "step": 418
    },
    {
      "epoch": 0.05497966146175043,
      "grad_norm": 3.282900810241699,
      "learning_rate": 6.863280701110408e-06,
      "loss": 1.8425,
      "step": 419
    },
    {
      "epoch": 0.055110877837554126,
      "grad_norm": 3.1004960536956787,
      "learning_rate": 6.698729810778065e-06,
      "loss": 1.6386,
      "step": 420
    },
    {
      "epoch": 0.05524209421335783,
      "grad_norm": 3.0315091609954834,
      "learning_rate": 6.536033802742813e-06,
      "loss": 1.792,
      "step": 421
    },
    {
      "epoch": 0.05537331058916153,
      "grad_norm": 3.5640015602111816,
      "learning_rate": 6.375199646360142e-06,
      "loss": 1.8219,
      "step": 422
    },
    {
      "epoch": 0.055504526964965224,
      "grad_norm": 3.106428861618042,
      "learning_rate": 6.216234231230012e-06,
      "loss": 1.5988,
      "step": 423
    },
    {
      "epoch": 0.05563574334076893,
      "grad_norm": 3.2876532077789307,
      "learning_rate": 6.059144366901736e-06,
      "loss": 1.6089,
      "step": 424
    },
    {
      "epoch": 0.055766959716572626,
      "grad_norm": 3.1995625495910645,
      "learning_rate": 5.903936782582253e-06,
      "loss": 1.597,
      "step": 425
    },
    {
      "epoch": 0.05589817609237633,
      "grad_norm": 3.096595287322998,
      "learning_rate": 5.750618126847912e-06,
      "loss": 1.3686,
      "step": 426
    },
    {
      "epoch": 0.05602939246818003,
      "grad_norm": 3.0396366119384766,
      "learning_rate": 5.599194967359639e-06,
      "loss": 1.5684,
      "step": 427
    },
    {
      "epoch": 0.05616060884398373,
      "grad_norm": 3.076733350753784,
      "learning_rate": 5.449673790581611e-06,
      "loss": 1.4296,
      "step": 428
    },
    {
      "epoch": 0.05629182521978743,
      "grad_norm": 3.201915740966797,
      "learning_rate": 5.302061001503394e-06,
      "loss": 1.6379,
      "step": 429
    },
    {
      "epoch": 0.05642304159559113,
      "grad_norm": 3.393874168395996,
      "learning_rate": 5.156362923365588e-06,
      "loss": 1.5963,
      "step": 430
    },
    {
      "epoch": 0.05655425797139483,
      "grad_norm": 3.4268441200256348,
      "learning_rate": 5.012585797388936e-06,
      "loss": 1.3698,
      "step": 431
    },
    {
      "epoch": 0.05668547434719853,
      "grad_norm": 3.298431158065796,
      "learning_rate": 4.87073578250698e-06,
      "loss": 1.5274,
      "step": 432
    },
    {
      "epoch": 0.05681669072300223,
      "grad_norm": 3.310758352279663,
      "learning_rate": 4.730818955102234e-06,
      "loss": 1.2892,
      "step": 433
    },
    {
      "epoch": 0.05694790709880593,
      "grad_norm": 3.1434218883514404,
      "learning_rate": 4.592841308745932e-06,
      "loss": 1.2763,
      "step": 434
    },
    {
      "epoch": 0.05707912347460963,
      "grad_norm": 3.1354024410247803,
      "learning_rate": 4.456808753941205e-06,
      "loss": 1.2465,
      "step": 435
    },
    {
      "epoch": 0.05721033985041333,
      "grad_norm": 3.235828161239624,
      "learning_rate": 4.322727117869951e-06,
      "loss": 1.1989,
      "step": 436
    },
    {
      "epoch": 0.05734155622621703,
      "grad_norm": 3.4798684120178223,
      "learning_rate": 4.190602144143207e-06,
      "loss": 1.3004,
      "step": 437
    },
    {
      "epoch": 0.05747277260202073,
      "grad_norm": 3.5043392181396484,
      "learning_rate": 4.06043949255509e-06,
      "loss": 1.1935,
      "step": 438
    },
    {
      "epoch": 0.057603988977824434,
      "grad_norm": 3.677044630050659,
      "learning_rate": 3.932244738840379e-06,
      "loss": 1.2309,
      "step": 439
    },
    {
      "epoch": 0.05773520535362813,
      "grad_norm": 4.231906414031982,
      "learning_rate": 3.8060233744356633e-06,
      "loss": 1.2004,
      "step": 440
    },
    {
      "epoch": 0.057866421729431836,
      "grad_norm": 3.8758881092071533,
      "learning_rate": 3.681780806244095e-06,
      "loss": 1.1777,
      "step": 441
    },
    {
      "epoch": 0.05799763810523553,
      "grad_norm": 3.5995683670043945,
      "learning_rate": 3.5595223564037884e-06,
      "loss": 1.1675,
      "step": 442
    },
    {
      "epoch": 0.05812885448103924,
      "grad_norm": 3.3509786128997803,
      "learning_rate": 3.4392532620598216e-06,
      "loss": 1.1342,
      "step": 443
    },
    {
      "epoch": 0.058260070856842934,
      "grad_norm": 3.3423001766204834,
      "learning_rate": 3.3209786751399187e-06,
      "loss": 1.0987,
      "step": 444
    },
    {
      "epoch": 0.05839128723264663,
      "grad_norm": 3.2946929931640625,
      "learning_rate": 3.2047036621337236e-06,
      "loss": 0.8206,
      "step": 445
    },
    {
      "epoch": 0.058522503608450335,
      "grad_norm": 3.773805618286133,
      "learning_rate": 3.0904332038757977e-06,
      "loss": 0.792,
      "step": 446
    },
    {
      "epoch": 0.05865371998425403,
      "grad_norm": 3.8902366161346436,
      "learning_rate": 2.978172195332263e-06,
      "loss": 0.9764,
      "step": 447
    },
    {
      "epoch": 0.05878493636005774,
      "grad_norm": 3.055663824081421,
      "learning_rate": 2.8679254453910785e-06,
      "loss": 0.5963,
      "step": 448
    },
    {
      "epoch": 0.058916152735861434,
      "grad_norm": 2.5872833728790283,
      "learning_rate": 2.759697676656098e-06,
      "loss": 0.4251,
      "step": 449
    },
    {
      "epoch": 0.05904736911166514,
      "grad_norm": 2.976865530014038,
      "learning_rate": 2.653493525244721e-06,
      "loss": 0.5668,
      "step": 450
    },
}, |
|
{ |
|
"epoch": 0.059178585487468835, |
|
"grad_norm": 1.0085222721099854, |
|
"learning_rate": 2.549317540589308e-06, |
|
"loss": 1.8009, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.05930980186327254, |
|
"grad_norm": 1.0669842958450317, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 1.7289, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.059441018239076236, |
|
"grad_norm": 1.1122716665267944, |
|
"learning_rate": 2.3470678346851518e-06, |
|
"loss": 1.8283, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.05957223461487994, |
|
"grad_norm": 1.1549385786056519, |
|
"learning_rate": 2.2490027771406687e-06, |
|
"loss": 1.925, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.05970345099068364, |
|
"grad_norm": 1.1655536890029907, |
|
"learning_rate": 2.152983213389559e-06, |
|
"loss": 1.9207, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.059834667366487335, |
|
"grad_norm": 1.4617455005645752, |
|
"learning_rate": 2.0590132565903476e-06, |
|
"loss": 1.9206, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.05996588374229104, |
|
"grad_norm": 1.655059576034546, |
|
"learning_rate": 1.9670969321032407e-06, |
|
"loss": 1.9933, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.060097100118094736, |
|
"grad_norm": 1.801466703414917, |
|
"learning_rate": 1.8772381773176417e-06, |
|
"loss": 1.9565, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.06022831649389844, |
|
"grad_norm": 1.9467339515686035, |
|
"learning_rate": 1.7894408414835362e-06, |
|
"loss": 1.9179, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.06035953286970214, |
|
"grad_norm": 2.1917128562927246, |
|
"learning_rate": 1.70370868554659e-06, |
|
"loss": 1.9006, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.06049074924550584, |
|
"grad_norm": 2.31794810295105, |
|
"learning_rate": 1.620045381987012e-06, |
|
"loss": 1.8937, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.06062196562130954, |
|
"grad_norm": 2.556521415710449, |
|
"learning_rate": 1.5384545146622852e-06, |
|
"loss": 1.9225, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.06075318199711324, |
|
"grad_norm": 2.7441983222961426, |
|
"learning_rate": 1.4589395786535953e-06, |
|
"loss": 1.7219, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.06088439837291694, |
|
"grad_norm": 2.604498863220215, |
|
"learning_rate": 1.3815039801161721e-06, |
|
"loss": 1.7661, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.061015614748720644, |
|
"grad_norm": 2.8174169063568115, |
|
"learning_rate": 1.3061510361333185e-06, |
|
"loss": 1.8408, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.06114683112452434, |
|
"grad_norm": 2.8200817108154297, |
|
"learning_rate": 1.232883974574367e-06, |
|
"loss": 1.7759, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.06127804750032804, |
|
"grad_norm": 3.005772829055786, |
|
"learning_rate": 1.1617059339563807e-06, |
|
"loss": 1.7675, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.06140926387613174, |
|
"grad_norm": 2.790365219116211, |
|
"learning_rate": 1.0926199633097157e-06, |
|
"loss": 1.6673, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.06154048025193544, |
|
"grad_norm": 2.7628681659698486, |
|
"learning_rate": 1.0256290220474307e-06, |
|
"loss": 1.5659, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.06167169662773914, |
|
"grad_norm": 2.9881839752197266, |
|
"learning_rate": 9.607359798384785e-07, |
|
"loss": 1.6035, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.06180291300354284, |
|
"grad_norm": 3.119401693344116, |
|
"learning_rate": 8.979436164848088e-07, |
|
"loss": 1.7385, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.061934129379346545, |
|
"grad_norm": 3.145608425140381, |
|
"learning_rate": 8.372546218022747e-07, |
|
"loss": 1.8044, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.06206534575515024, |
|
"grad_norm": 3.2894794940948486, |
|
"learning_rate": 7.786715955054203e-07, |
|
"loss": 1.6499, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.062196562130953946, |
|
"grad_norm": 3.073772430419922, |
|
"learning_rate": 7.221970470961125e-07, |
|
"loss": 1.6461, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.06232777850675764, |
|
"grad_norm": 2.914522647857666, |
|
"learning_rate": 6.678333957560512e-07, |
|
"loss": 1.4814, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.06245899488256134, |
|
"grad_norm": 3.2599356174468994, |
|
"learning_rate": 6.15582970243117e-07, |
|
"loss": 1.5119, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.06259021125836504, |
|
"grad_norm": 3.0420191287994385, |
|
"learning_rate": 5.654480087916303e-07, |
|
"loss": 1.5962, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.06272142763416874, |
|
"grad_norm": 3.278502941131592, |
|
"learning_rate": 5.174306590164879e-07, |
|
"loss": 1.5504, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.06285264400997244, |
|
"grad_norm": 2.9558990001678467, |
|
"learning_rate": 4.715329778211375e-07, |
|
"loss": 1.4017, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.06298386038577615, |
|
"grad_norm": 3.2562479972839355, |
|
"learning_rate": 4.277569313094809e-07, |
|
"loss": 1.3569, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.06311507676157985, |
|
"grad_norm": 3.3667497634887695, |
|
"learning_rate": 3.8610439470164737e-07, |
|
"loss": 1.452, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.06324629313738354, |
|
"grad_norm": 3.1160595417022705, |
|
"learning_rate": 3.465771522536854e-07, |
|
"loss": 1.2796, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.06337750951318724, |
|
"grad_norm": 3.3417913913726807, |
|
"learning_rate": 3.09176897181096e-07, |
|
"loss": 1.4638, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.06350872588899095, |
|
"grad_norm": 3.3148908615112305, |
|
"learning_rate": 2.7390523158633554e-07, |
|
"loss": 1.3271, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.06363994226479465, |
|
"grad_norm": 3.5485448837280273, |
|
"learning_rate": 2.407636663901591e-07, |
|
"loss": 1.4274, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.06377115864059835, |
|
"grad_norm": 3.49263858795166, |
|
"learning_rate": 2.0975362126691712e-07, |
|
"loss": 1.3793, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.06390237501640204, |
|
"grad_norm": 3.290903091430664, |
|
"learning_rate": 1.8087642458373134e-07, |
|
"loss": 1.2698, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.06403359139220574, |
|
"grad_norm": 3.2131378650665283, |
|
"learning_rate": 1.5413331334360182e-07, |
|
"loss": 1.1554, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.06416480776800945, |
|
"grad_norm": 3.7000808715820312, |
|
"learning_rate": 1.2952543313240472e-07, |
|
"loss": 1.2878, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.06429602414381315, |
|
"grad_norm": 3.0949819087982178, |
|
"learning_rate": 1.0705383806982606e-07, |
|
"loss": 1.0326, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.06442724051961685, |
|
"grad_norm": 3.2864902019500732, |
|
"learning_rate": 8.671949076420882e-08, |
|
"loss": 1.041, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.06455845689542054, |
|
"grad_norm": 3.193171262741089, |
|
"learning_rate": 6.852326227130834e-08, |
|
"loss": 0.9282, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.06468967327122425, |
|
"grad_norm": 3.8569085597991943, |
|
"learning_rate": 5.246593205699424e-08, |
|
"loss": 0.8974, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.06482088964702795, |
|
"grad_norm": 3.4853317737579346, |
|
"learning_rate": 3.8548187963854956e-08, |
|
"loss": 0.797, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.06495210602283165, |
|
"grad_norm": 3.2885305881500244, |
|
"learning_rate": 2.6770626181715773e-08, |
|
"loss": 0.9086, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.06508332239863535, |
|
"grad_norm": 3.696621894836426, |
|
"learning_rate": 1.7133751222137007e-08, |
|
"loss": 0.8038, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.06521453877443904, |
|
"grad_norm": 3.6335575580596924, |
|
"learning_rate": 9.637975896759077e-09, |
|
"loss": 0.9278, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.06534575515024275, |
|
"grad_norm": 3.405994176864624, |
|
"learning_rate": 4.2836212996499865e-09, |
|
"loss": 0.7081, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.06547697152604645, |
|
"grad_norm": 2.7525062561035156, |
|
"learning_rate": 1.0709167935385455e-09, |
|
"loss": 0.5322, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.06560818790185015, |
|
"grad_norm": 3.2091686725616455, |
|
"learning_rate": 0.0, |
|
"loss": 0.491, |
|
"step": 500 |
|
}, |
|
    {
      "epoch": 0.06560818790185015,
      "eval_loss": 1.454202651977539,
      "eval_runtime": 1356.7364,
      "eval_samples_per_second": 9.461,
      "eval_steps_per_second": 2.365,
      "step": 500
    }
  ],
  "logging_steps": 1,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.026400488001372e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}