UCCIX-Llama2-13B-Instruct-191224/uccix_instruct_191224_lr1e-5/checkpoint-1092/trainer_state.json
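
Below is the raw trainer state for this checkpoint. As a convenience, here is a minimal sketch (assuming Python 3 with matplotlib installed, and this file on disk at the checkpoint path shown above) of how one might load the log and plot the training-loss curve; every key it reads (log_history, step, loss) appears in the JSON below.

import json
import matplotlib.pyplot as plt

# Path assumed from the checkpoint layout above.
with open("uccix_instruct_191224_lr1e-5/checkpoint-1092/trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (eval entries would lack it).
train_logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title("uccix_instruct_191224_lr1e-5, checkpoint-1092")
plt.show()
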
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 7.0,
  "eval_steps": 500,
  "global_step": 1092,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00641025641025641,
      "grad_norm": 3.8148568052575884,
      "learning_rate": 1.282051282051282e-07,
      "loss": 4.889,
      "step": 1
    },
    {
      "epoch": 0.01282051282051282,
      "grad_norm": 4.453444589892027,
      "learning_rate": 2.564102564102564e-07,
      "loss": 4.9097,
      "step": 2
    },
    {
      "epoch": 0.02564102564102564,
      "grad_norm": 4.896614258621833,
      "learning_rate": 5.128205128205128e-07,
      "loss": 4.9099,
      "step": 4
    },
    {
      "epoch": 0.038461538461538464,
      "grad_norm": 4.456576485464451,
      "learning_rate": 7.692307692307694e-07,
      "loss": 4.9102,
      "step": 6
    },
    {
      "epoch": 0.05128205128205128,
      "grad_norm": 4.193427815120892,
      "learning_rate": 1.0256410256410257e-06,
      "loss": 4.8924,
      "step": 8
    },
    {
      "epoch": 0.0641025641025641,
      "grad_norm": 3.6726747534666555,
      "learning_rate": 1.282051282051282e-06,
      "loss": 4.8372,
      "step": 10
    },
    {
      "epoch": 0.07692307692307693,
      "grad_norm": 3.337981680961211,
      "learning_rate": 1.5384615384615387e-06,
      "loss": 4.7794,
      "step": 12
    },
    {
      "epoch": 0.08974358974358974,
      "grad_norm": 2.675890453922504,
      "learning_rate": 1.794871794871795e-06,
      "loss": 4.6191,
      "step": 14
    },
    {
      "epoch": 0.10256410256410256,
      "grad_norm": 2.398848700299253,
      "learning_rate": 2.0512820512820513e-06,
      "loss": 4.5723,
      "step": 16
    },
    {
      "epoch": 0.11538461538461539,
      "grad_norm": 1.8159784961859098,
      "learning_rate": 2.307692307692308e-06,
      "loss": 4.3568,
      "step": 18
    },
    {
      "epoch": 0.1282051282051282,
      "grad_norm": 1.6094220673057946,
      "learning_rate": 2.564102564102564e-06,
      "loss": 4.2686,
      "step": 20
    },
    {
      "epoch": 0.14102564102564102,
      "grad_norm": 1.4349818434671497,
      "learning_rate": 2.8205128205128207e-06,
      "loss": 4.169,
      "step": 22
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 1.4412559958198408,
      "learning_rate": 3.0769230769230774e-06,
      "loss": 4.0415,
      "step": 24
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 1.3626982007755366,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 3.8569,
      "step": 26
    },
    {
      "epoch": 0.1794871794871795,
      "grad_norm": 1.3679096739652512,
      "learning_rate": 3.58974358974359e-06,
      "loss": 3.7409,
      "step": 28
    },
    {
      "epoch": 0.19230769230769232,
      "grad_norm": 1.3396391976584703,
      "learning_rate": 3.846153846153847e-06,
      "loss": 3.6585,
      "step": 30
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 1.294876480457606,
      "learning_rate": 4.102564102564103e-06,
      "loss": 3.4961,
      "step": 32
    },
    {
      "epoch": 0.21794871794871795,
      "grad_norm": 1.103820056614455,
      "learning_rate": 4.358974358974359e-06,
      "loss": 3.3518,
      "step": 34
    },
    {
      "epoch": 0.23076923076923078,
      "grad_norm": 1.0522131115906572,
      "learning_rate": 4.615384615384616e-06,
      "loss": 3.1984,
      "step": 36
    },
    {
      "epoch": 0.24358974358974358,
      "grad_norm": 1.0081732884085817,
      "learning_rate": 4.871794871794872e-06,
      "loss": 3.054,
      "step": 38
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 0.9214039999549644,
      "learning_rate": 5.128205128205128e-06,
      "loss": 2.8628,
      "step": 40
    },
    {
      "epoch": 0.2692307692307692,
      "grad_norm": 0.8143994876297143,
      "learning_rate": 5.384615384615385e-06,
      "loss": 2.7475,
      "step": 42
    },
    {
      "epoch": 0.28205128205128205,
      "grad_norm": 0.700891765547207,
      "learning_rate": 5.641025641025641e-06,
      "loss": 2.5869,
      "step": 44
    },
    {
      "epoch": 0.2948717948717949,
      "grad_norm": 0.7510674065754775,
      "learning_rate": 5.897435897435898e-06,
      "loss": 2.4461,
      "step": 46
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 0.6794074940373539,
      "learning_rate": 6.153846153846155e-06,
      "loss": 2.3477,
      "step": 48
    },
    {
      "epoch": 0.32051282051282054,
      "grad_norm": 0.5162215042692575,
      "learning_rate": 6.410256410256412e-06,
      "loss": 2.2152,
      "step": 50
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.5146975027904754,
      "learning_rate": 6.666666666666667e-06,
      "loss": 2.1975,
      "step": 52
    },
    {
      "epoch": 0.34615384615384615,
      "grad_norm": 0.4474574545979082,
      "learning_rate": 6.923076923076923e-06,
      "loss": 2.0824,
      "step": 54
    },
    {
      "epoch": 0.358974358974359,
      "grad_norm": 0.40379510918119965,
      "learning_rate": 7.17948717948718e-06,
      "loss": 2.0388,
      "step": 56
    },
    {
      "epoch": 0.3717948717948718,
      "grad_norm": 0.4109144194248555,
      "learning_rate": 7.435897435897437e-06,
      "loss": 1.9699,
      "step": 58
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 0.36878556755849573,
      "learning_rate": 7.692307692307694e-06,
      "loss": 1.9252,
      "step": 60
    },
    {
      "epoch": 0.3974358974358974,
      "grad_norm": 0.33951214974325605,
      "learning_rate": 7.948717948717949e-06,
      "loss": 1.8773,
      "step": 62
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 0.31625266306424027,
      "learning_rate": 8.205128205128205e-06,
      "loss": 1.7966,
      "step": 64
    },
    {
      "epoch": 0.4230769230769231,
      "grad_norm": 0.7180890498799148,
      "learning_rate": 8.461538461538462e-06,
      "loss": 1.8108,
      "step": 66
    },
    {
      "epoch": 0.4358974358974359,
      "grad_norm": 0.33704662479371716,
      "learning_rate": 8.717948717948719e-06,
      "loss": 1.7498,
      "step": 68
    },
    {
      "epoch": 0.44871794871794873,
      "grad_norm": 0.2761824271642518,
      "learning_rate": 8.974358974358976e-06,
      "loss": 1.7124,
      "step": 70
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 0.24386286193528572,
      "learning_rate": 9.230769230769232e-06,
      "loss": 1.6382,
      "step": 72
    },
    {
      "epoch": 0.47435897435897434,
      "grad_norm": 0.25885451676676363,
      "learning_rate": 9.487179487179487e-06,
      "loss": 1.6588,
      "step": 74
    },
    {
      "epoch": 0.48717948717948717,
      "grad_norm": 0.3040030663690383,
      "learning_rate": 9.743589743589744e-06,
      "loss": 1.6209,
      "step": 76
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.26598080566137733,
      "learning_rate": 1e-05,
      "loss": 1.6294,
      "step": 78
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 0.22696288673824674,
      "learning_rate": 9.99995506314361e-06,
      "loss": 1.58,
      "step": 80
    },
    {
      "epoch": 0.5256410256410257,
      "grad_norm": 0.21242259411358655,
      "learning_rate": 9.99982025338217e-06,
      "loss": 1.5439,
      "step": 82
    },
    {
      "epoch": 0.5384615384615384,
      "grad_norm": 0.20291826899403465,
      "learning_rate": 9.999595573138845e-06,
      "loss": 1.5274,
      "step": 84
    },
    {
      "epoch": 0.5512820512820513,
      "grad_norm": 0.1855444412322797,
      "learning_rate": 9.99928102645221e-06,
      "loss": 1.5161,
      "step": 86
    },
    {
      "epoch": 0.5641025641025641,
      "grad_norm": 0.17883874148398324,
      "learning_rate": 9.99887661897616e-06,
      "loss": 1.4916,
      "step": 88
    },
    {
      "epoch": 0.5769230769230769,
      "grad_norm": 0.17041478792908024,
      "learning_rate": 9.99838235797981e-06,
      "loss": 1.4679,
      "step": 90
    },
    {
      "epoch": 0.5897435897435898,
      "grad_norm": 0.1904762198987749,
      "learning_rate": 9.997798252347382e-06,
      "loss": 1.471,
      "step": 92
    },
    {
      "epoch": 0.6025641025641025,
      "grad_norm": 0.19077041355708335,
      "learning_rate": 9.99712431257802e-06,
      "loss": 1.4672,
      "step": 94
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.1702104328191874,
      "learning_rate": 9.996360550785619e-06,
      "loss": 1.4455,
      "step": 96
    },
    {
      "epoch": 0.6282051282051282,
      "grad_norm": 0.19039133859515542,
      "learning_rate": 9.9955069806986e-06,
      "loss": 1.4727,
      "step": 98
    },
    {
      "epoch": 0.6410256410256411,
      "grad_norm": 0.15448238517128507,
      "learning_rate": 9.994563617659665e-06,
      "loss": 1.4257,
      "step": 100
    },
    {
      "epoch": 0.6538461538461539,
      "grad_norm": 0.15202351051018634,
      "learning_rate": 9.993530478625524e-06,
      "loss": 1.4214,
      "step": 102
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.16296598133044526,
      "learning_rate": 9.992407582166582e-06,
      "loss": 1.4213,
      "step": 104
    },
    {
      "epoch": 0.6794871794871795,
      "grad_norm": 0.1462038294164801,
      "learning_rate": 9.991194948466615e-06,
      "loss": 1.3993,
      "step": 106
    },
    {
      "epoch": 0.6923076923076923,
      "grad_norm": 0.14470989191451086,
      "learning_rate": 9.989892599322404e-06,
      "loss": 1.4014,
      "step": 108
    },
    {
      "epoch": 0.7051282051282052,
      "grad_norm": 0.15440545758233384,
      "learning_rate": 9.988500558143337e-06,
      "loss": 1.3878,
      "step": 110
    },
    {
      "epoch": 0.717948717948718,
      "grad_norm": 0.1412948019214843,
      "learning_rate": 9.987018849950996e-06,
      "loss": 1.355,
      "step": 112
    },
    {
      "epoch": 0.7307692307692307,
      "grad_norm": 0.15156074653795895,
      "learning_rate": 9.985447501378706e-06,
      "loss": 1.3642,
      "step": 114
    },
    {
      "epoch": 0.7435897435897436,
      "grad_norm": 0.3875845143038168,
      "learning_rate": 9.983786540671052e-06,
      "loss": 1.3797,
      "step": 116
    },
    {
      "epoch": 0.7564102564102564,
      "grad_norm": 0.15788537547887518,
      "learning_rate": 9.982035997683372e-06,
      "loss": 1.3388,
      "step": 118
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.15056320914445512,
      "learning_rate": 9.980195903881231e-06,
      "loss": 1.343,
      "step": 120
    },
    {
      "epoch": 0.782051282051282,
      "grad_norm": 0.1555129283317706,
      "learning_rate": 9.978266292339838e-06,
      "loss": 1.328,
      "step": 122
    },
    {
      "epoch": 0.7948717948717948,
      "grad_norm": 0.14999182496915453,
      "learning_rate": 9.976247197743465e-06,
      "loss": 1.352,
      "step": 124
    },
    {
      "epoch": 0.8076923076923077,
      "grad_norm": 0.14124313426191026,
      "learning_rate": 9.974138656384815e-06,
      "loss": 1.3243,
      "step": 126
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 0.1378326204862212,
      "learning_rate": 9.97194070616438e-06,
      "loss": 1.3241,
      "step": 128
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 0.14227960534974604,
      "learning_rate": 9.969653386589749e-06,
      "loss": 1.3219,
      "step": 130
    },
    {
      "epoch": 0.8461538461538461,
      "grad_norm": 0.12713543749272155,
      "learning_rate": 9.967276738774897e-06,
      "loss": 1.3096,
      "step": 132
    },
    {
      "epoch": 0.8589743589743589,
      "grad_norm": 0.15061232362563903,
      "learning_rate": 9.964810805439464e-06,
      "loss": 1.3011,
      "step": 134
    },
    {
      "epoch": 0.8717948717948718,
      "grad_norm": 0.14361563348990292,
      "learning_rate": 9.962255630907964e-06,
      "loss": 1.2827,
      "step": 136
    },
    {
      "epoch": 0.8846153846153846,
      "grad_norm": 0.17754387209035652,
      "learning_rate": 9.959611261108999e-06,
      "loss": 1.3185,
      "step": 138
    },
    {
      "epoch": 0.8974358974358975,
      "grad_norm": 0.1458623897430443,
      "learning_rate": 9.956877743574437e-06,
      "loss": 1.3286,
      "step": 140
    },
    {
      "epoch": 0.9102564102564102,
      "grad_norm": 0.14084398418567437,
      "learning_rate": 9.954055127438554e-06,
      "loss": 1.3005,
      "step": 142
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.13580861113069753,
      "learning_rate": 9.951143463437145e-06,
      "loss": 1.3165,
      "step": 144
    },
    {
      "epoch": 0.9358974358974359,
      "grad_norm": 0.13622051889734035,
      "learning_rate": 9.948142803906623e-06,
      "loss": 1.2929,
      "step": 146
    },
    {
      "epoch": 0.9487179487179487,
      "grad_norm": 0.12679082371935066,
      "learning_rate": 9.94505320278307e-06,
      "loss": 1.2833,
      "step": 148
    },
    {
      "epoch": 0.9615384615384616,
      "grad_norm": 0.11939382079952243,
      "learning_rate": 9.94187471560127e-06,
      "loss": 1.2851,
      "step": 150
    },
    {
      "epoch": 0.9743589743589743,
      "grad_norm": 0.11752490134274678,
      "learning_rate": 9.938607399493714e-06,
      "loss": 1.2559,
      "step": 152
    },
    {
      "epoch": 0.9871794871794872,
      "grad_norm": 0.11807212671773365,
      "learning_rate": 9.935251313189564e-06,
      "loss": 1.285,
      "step": 154
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.1120761333795772,
      "learning_rate": 9.931806517013612e-06,
      "loss": 1.2491,
      "step": 156
    },
    {
      "epoch": 1.0128205128205128,
      "grad_norm": 0.10750345822189263,
      "learning_rate": 9.92827307288518e-06,
      "loss": 1.2442,
      "step": 158
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 0.10918642022881683,
      "learning_rate": 9.924651044317017e-06,
      "loss": 1.2286,
      "step": 160
    },
    {
      "epoch": 1.0384615384615385,
      "grad_norm": 0.11225330042691335,
      "learning_rate": 9.920940496414153e-06,
      "loss": 1.2158,
      "step": 162
    },
    {
      "epoch": 1.0512820512820513,
      "grad_norm": 0.11366482652198566,
      "learning_rate": 9.917141495872733e-06,
      "loss": 1.2074,
      "step": 164
    },
    {
      "epoch": 1.064102564102564,
      "grad_norm": 0.12295651003296312,
      "learning_rate": 9.913254110978812e-06,
      "loss": 1.2003,
      "step": 166
    },
    {
      "epoch": 1.0769230769230769,
      "grad_norm": 0.1144456030840293,
      "learning_rate": 9.909278411607134e-06,
      "loss": 1.206,
      "step": 168
    },
    {
      "epoch": 1.0897435897435896,
      "grad_norm": 0.2468334129961725,
      "learning_rate": 9.90521446921987e-06,
      "loss": 1.2235,
      "step": 170
    },
    {
      "epoch": 1.1025641025641026,
      "grad_norm": 0.127278158070263,
      "learning_rate": 9.90106235686534e-06,
      "loss": 1.1928,
      "step": 172
    },
    {
      "epoch": 1.1153846153846154,
      "grad_norm": 0.1280282060730887,
      "learning_rate": 9.896822149176695e-06,
      "loss": 1.2068,
      "step": 174
    },
    {
      "epoch": 1.1282051282051282,
      "grad_norm": 0.1142922422404122,
      "learning_rate": 9.892493922370575e-06,
      "loss": 1.217,
      "step": 176
    },
    {
      "epoch": 1.141025641025641,
      "grad_norm": 0.17470470224878323,
      "learning_rate": 9.888077754245741e-06,
      "loss": 1.2099,
      "step": 178
    },
    {
      "epoch": 1.1538461538461537,
      "grad_norm": 0.10477882692325258,
      "learning_rate": 9.883573724181683e-06,
      "loss": 1.1944,
      "step": 180
    },
    {
      "epoch": 1.1666666666666667,
      "grad_norm": 0.114790034377695,
      "learning_rate": 9.878981913137178e-06,
      "loss": 1.172,
      "step": 182
    },
    {
      "epoch": 1.1794871794871795,
      "grad_norm": 0.1044922535107306,
      "learning_rate": 9.87430240364885e-06,
      "loss": 1.2147,
      "step": 184
    },
    {
      "epoch": 1.1923076923076923,
      "grad_norm": 0.09771283060341285,
      "learning_rate": 9.869535279829674e-06,
      "loss": 1.173,
      "step": 186
    },
    {
      "epoch": 1.205128205128205,
      "grad_norm": 0.1013995999635824,
      "learning_rate": 9.864680627367476e-06,
      "loss": 1.2023,
      "step": 188
    },
    {
      "epoch": 1.217948717948718,
      "grad_norm": 0.10273326452887067,
      "learning_rate": 9.859738533523384e-06,
      "loss": 1.1732,
      "step": 190
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 0.09684048616936082,
      "learning_rate": 9.854709087130261e-06,
      "loss": 1.1952,
      "step": 192
    },
    {
      "epoch": 1.2435897435897436,
      "grad_norm": 0.10827760658070901,
      "learning_rate": 9.849592378591113e-06,
      "loss": 1.1864,
      "step": 194
    },
    {
      "epoch": 1.2564102564102564,
      "grad_norm": 0.09989527940011267,
      "learning_rate": 9.844388499877457e-06,
      "loss": 1.2016,
      "step": 196
    },
    {
      "epoch": 1.2692307692307692,
      "grad_norm": 0.09930771667309381,
      "learning_rate": 9.839097544527674e-06,
      "loss": 1.1738,
      "step": 198
    },
    {
      "epoch": 1.282051282051282,
      "grad_norm": 0.1032001919164007,
      "learning_rate": 9.833719607645325e-06,
      "loss": 1.176,
      "step": 200
    },
    {
      "epoch": 1.294871794871795,
      "grad_norm": 0.09859412157061716,
      "learning_rate": 9.82825478589744e-06,
      "loss": 1.1682,
      "step": 202
    },
    {
      "epoch": 1.3076923076923077,
      "grad_norm": 0.09558235334437347,
      "learning_rate": 9.822703177512783e-06,
      "loss": 1.181,
      "step": 204
    },
    {
      "epoch": 1.3205128205128205,
      "grad_norm": 0.08733478657745303,
      "learning_rate": 9.817064882280085e-06,
      "loss": 1.1686,
      "step": 206
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.09397505343456257,
      "learning_rate": 9.811340001546252e-06,
      "loss": 1.1778,
      "step": 208
    },
    {
      "epoch": 1.3461538461538463,
      "grad_norm": 0.09590407825516856,
      "learning_rate": 9.805528638214543e-06,
      "loss": 1.1542,
      "step": 210
    },
    {
      "epoch": 1.358974358974359,
      "grad_norm": 0.0912508440064145,
      "learning_rate": 9.799630896742716e-06,
      "loss": 1.1643,
      "step": 212
    },
    {
      "epoch": 1.3717948717948718,
      "grad_norm": 0.09258955107744923,
      "learning_rate": 9.793646883141155e-06,
      "loss": 1.1686,
      "step": 214
    },
    {
      "epoch": 1.3846153846153846,
      "grad_norm": 0.09889457149777804,
      "learning_rate": 9.787576704970965e-06,
      "loss": 1.1677,
      "step": 216
    },
    {
      "epoch": 1.3974358974358974,
      "grad_norm": 0.09374670756166416,
      "learning_rate": 9.781420471342035e-06,
      "loss": 1.146,
      "step": 218
    },
    {
      "epoch": 1.4102564102564101,
      "grad_norm": 0.09136677460744856,
      "learning_rate": 9.77517829291108e-06,
      "loss": 1.1594,
      "step": 220
    },
    {
      "epoch": 1.4230769230769231,
      "grad_norm": 0.10584946030378292,
      "learning_rate": 9.768850281879651e-06,
      "loss": 1.1865,
      "step": 222
    },
    {
      "epoch": 1.435897435897436,
      "grad_norm": 0.09187981607301214,
      "learning_rate": 9.762436551992117e-06,
      "loss": 1.1606,
      "step": 224
    },
    {
      "epoch": 1.4487179487179487,
      "grad_norm": 0.09880449655805854,
      "learning_rate": 9.755937218533622e-06,
      "loss": 1.1586,
      "step": 226
    },
    {
      "epoch": 1.4615384615384617,
      "grad_norm": 0.08704607108972029,
      "learning_rate": 9.74935239832801e-06,
      "loss": 1.1746,
      "step": 228
    },
    {
      "epoch": 1.4743589743589745,
      "grad_norm": 0.08909112778091671,
      "learning_rate": 9.742682209735727e-06,
      "loss": 1.1575,
      "step": 230
    },
    {
      "epoch": 1.4871794871794872,
      "grad_norm": 0.09035998053799675,
      "learning_rate": 9.735926772651703e-06,
      "loss": 1.1678,
      "step": 232
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.09500864788295198,
      "learning_rate": 9.729086208503174e-06,
      "loss": 1.1466,
      "step": 234
    },
    {
      "epoch": 1.5128205128205128,
      "grad_norm": 0.09247434213683463,
      "learning_rate": 9.722160640247523e-06,
      "loss": 1.1687,
      "step": 236
    },
    {
      "epoch": 1.5256410256410255,
      "grad_norm": 0.09322212100100113,
      "learning_rate": 9.715150192370054e-06,
      "loss": 1.1376,
      "step": 238
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.08824919508271642,
      "learning_rate": 9.708054990881763e-06,
      "loss": 1.1523,
      "step": 240
    },
    {
      "epoch": 1.5512820512820513,
      "grad_norm": 0.25559730635424294,
      "learning_rate": 9.700875163317072e-06,
      "loss": 1.1488,
      "step": 242
    },
    {
      "epoch": 1.564102564102564,
      "grad_norm": 0.2487505162861363,
      "learning_rate": 9.693610838731532e-06,
      "loss": 1.1481,
      "step": 244
    },
    {
      "epoch": 1.5769230769230769,
      "grad_norm": 0.12151469789600829,
      "learning_rate": 9.686262147699507e-06,
      "loss": 1.1483,
      "step": 246
    },
    {
      "epoch": 1.5897435897435899,
      "grad_norm": 0.10407519891252137,
      "learning_rate": 9.678829222311827e-06,
      "loss": 1.13,
      "step": 248
    },
    {
      "epoch": 1.6025641025641026,
      "grad_norm": 0.11236395690738615,
      "learning_rate": 9.671312196173413e-06,
      "loss": 1.1493,
      "step": 250
    },
    {
      "epoch": 1.6153846153846154,
      "grad_norm": 0.1012523372817843,
      "learning_rate": 9.663711204400872e-06,
      "loss": 1.148,
      "step": 252
    },
    {
      "epoch": 1.6282051282051282,
      "grad_norm": 0.09652583778417714,
      "learning_rate": 9.656026383620076e-06,
      "loss": 1.1074,
      "step": 254
    },
    {
      "epoch": 1.641025641025641,
      "grad_norm": 0.09448533541138639,
      "learning_rate": 9.6482578719637e-06,
      "loss": 1.1486,
      "step": 256
    },
    {
      "epoch": 1.6538461538461537,
      "grad_norm": 0.09453430664055591,
      "learning_rate": 9.640405809068743e-06,
      "loss": 1.1197,
      "step": 258
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.0952812616531032,
      "learning_rate": 9.632470336074009e-06,
      "loss": 1.1337,
      "step": 260
    },
    {
      "epoch": 1.6794871794871795,
      "grad_norm": 0.09048018082770859,
      "learning_rate": 9.624451595617588e-06,
      "loss": 1.0885,
      "step": 262
    },
    {
      "epoch": 1.6923076923076923,
      "grad_norm": 0.0922717302732401,
      "learning_rate": 9.616349731834271e-06,
      "loss": 1.1294,
      "step": 264
    },
    {
      "epoch": 1.7051282051282053,
      "grad_norm": 0.09113342238000427,
      "learning_rate": 9.608164890352977e-06,
      "loss": 1.0871,
      "step": 266
    },
    {
      "epoch": 1.717948717948718,
      "grad_norm": 0.10188653395954697,
      "learning_rate": 9.599897218294122e-06,
      "loss": 1.1237,
      "step": 268
    },
    {
      "epoch": 1.7307692307692308,
      "grad_norm": 0.08946291041522332,
      "learning_rate": 9.591546864266983e-06,
      "loss": 1.1129,
      "step": 270
    },
    {
      "epoch": 1.7435897435897436,
      "grad_norm": 0.092702242157672,
      "learning_rate": 9.583113978367026e-06,
      "loss": 1.1089,
      "step": 272
    },
    {
      "epoch": 1.7564102564102564,
      "grad_norm": 0.1140491779513373,
      "learning_rate": 9.574598712173202e-06,
      "loss": 1.1286,
      "step": 274
    },
    {
      "epoch": 1.7692307692307692,
      "grad_norm": 0.09516237353719291,
      "learning_rate": 9.56600121874523e-06,
      "loss": 1.1122,
      "step": 276
    },
    {
      "epoch": 1.782051282051282,
      "grad_norm": 0.08916708413619781,
      "learning_rate": 9.557321652620839e-06,
      "loss": 1.1048,
      "step": 278
    },
    {
      "epoch": 1.7948717948717947,
      "grad_norm": 0.09140805156925046,
      "learning_rate": 9.548560169812997e-06,
      "loss": 1.1058,
      "step": 280
    },
    {
      "epoch": 1.8076923076923077,
      "grad_norm": 0.08683635001330178,
      "learning_rate": 9.539716927807102e-06,
      "loss": 1.0925,
      "step": 282
    },
    {
      "epoch": 1.8205128205128205,
      "grad_norm": 0.09284148179598711,
      "learning_rate": 9.530792085558151e-06,
      "loss": 1.0948,
      "step": 284
    },
    {
      "epoch": 1.8333333333333335,
      "grad_norm": 0.08800610945553744,
      "learning_rate": 9.521785803487888e-06,
      "loss": 1.1116,
      "step": 286
    },
    {
      "epoch": 1.8461538461538463,
      "grad_norm": 0.08758546749473674,
      "learning_rate": 9.512698243481914e-06,
      "loss": 1.1059,
      "step": 288
    },
    {
      "epoch": 1.858974358974359,
      "grad_norm": 0.08336608124209365,
      "learning_rate": 9.50352956888678e-06,
      "loss": 1.1015,
      "step": 290
    },
    {
      "epoch": 1.8717948717948718,
      "grad_norm": 0.09199580396288136,
      "learning_rate": 9.49427994450705e-06,
      "loss": 1.0828,
      "step": 292
    },
    {
      "epoch": 1.8846153846153846,
      "grad_norm": 0.5410940704298627,
      "learning_rate": 9.484949536602343e-06,
      "loss": 1.1412,
      "step": 294
    },
    {
      "epoch": 1.8974358974358974,
      "grad_norm": 0.08913430120295451,
      "learning_rate": 9.47553851288434e-06,
      "loss": 1.1073,
      "step": 296
    },
    {
      "epoch": 1.9102564102564101,
      "grad_norm": 0.09420167495815907,
      "learning_rate": 9.466047042513767e-06,
      "loss": 1.0957,
      "step": 298
    },
    {
      "epoch": 1.9230769230769231,
      "grad_norm": 0.08189970955203785,
      "learning_rate": 9.45647529609736e-06,
      "loss": 1.0909,
      "step": 300
    },
    {
      "epoch": 1.935897435897436,
      "grad_norm": 0.09065809775757692,
      "learning_rate": 9.4468234456848e-06,
      "loss": 1.0896,
      "step": 302
    },
    {
      "epoch": 1.9487179487179487,
      "grad_norm": 0.08763498764491487,
      "learning_rate": 9.437091664765611e-06,
      "loss": 1.1099,
      "step": 304
    },
    {
      "epoch": 1.9615384615384617,
      "grad_norm": 0.09257403574026254,
      "learning_rate": 9.427280128266049e-06,
      "loss": 1.1236,
      "step": 306
    },
    {
      "epoch": 1.9743589743589745,
      "grad_norm": 0.08983923370086075,
      "learning_rate": 9.41738901254596e-06,
      "loss": 1.0909,
      "step": 308
    },
    {
      "epoch": 1.9871794871794872,
      "grad_norm": 0.086289850522152,
      "learning_rate": 9.4074184953956e-06,
      "loss": 1.0942,
      "step": 310
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.0874296283040965,
      "learning_rate": 9.397368756032445e-06,
      "loss": 1.0651,
      "step": 312
    },
    {
      "epoch": 2.0128205128205128,
      "grad_norm": 0.0848953888966574,
      "learning_rate": 9.38723997509798e-06,
      "loss": 1.0569,
      "step": 314
    },
    {
      "epoch": 2.0256410256410255,
      "grad_norm": 0.08790616172980993,
      "learning_rate": 9.37703233465443e-06,
      "loss": 1.035,
      "step": 316
    },
    {
      "epoch": 2.0384615384615383,
      "grad_norm": 0.08376355574572536,
      "learning_rate": 9.366746018181503e-06,
      "loss": 1.0379,
      "step": 318
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 0.7353839032057593,
      "learning_rate": 9.356381210573092e-06,
      "loss": 1.0623,
      "step": 320
    },
    {
      "epoch": 2.064102564102564,
      "grad_norm": 0.09158722362975955,
      "learning_rate": 9.345938098133946e-06,
      "loss": 1.0264,
      "step": 322
    },
    {
      "epoch": 2.076923076923077,
      "grad_norm": 0.08819422670959466,
      "learning_rate": 9.33541686857632e-06,
      "loss": 1.0456,
      "step": 324
    },
    {
      "epoch": 2.08974358974359,
      "grad_norm": 0.0905819981621342,
      "learning_rate": 9.324817711016609e-06,
      "loss": 1.0239,
      "step": 326
    },
    {
      "epoch": 2.1025641025641026,
      "grad_norm": 0.08799589635983858,
      "learning_rate": 9.31414081597194e-06,
      "loss": 1.0498,
      "step": 328
    },
    {
      "epoch": 2.1153846153846154,
      "grad_norm": 0.0847927160084877,
      "learning_rate": 9.303386375356752e-06,
      "loss": 1.0163,
      "step": 330
    },
    {
      "epoch": 2.128205128205128,
      "grad_norm": 0.09169187613815971,
      "learning_rate": 9.292554582479349e-06,
      "loss": 1.0054,
      "step": 332
    },
    {
      "epoch": 2.141025641025641,
      "grad_norm": 0.08905293788047657,
      "learning_rate": 9.281645632038417e-06,
      "loss": 1.062,
      "step": 334
    },
    {
      "epoch": 2.1538461538461537,
      "grad_norm": 0.09229173633666073,
      "learning_rate": 9.270659720119533e-06,
      "loss": 1.039,
      "step": 336
    },
    {
      "epoch": 2.1666666666666665,
      "grad_norm": 0.08430144514732368,
      "learning_rate": 9.259597044191635e-06,
      "loss": 1.0268,
      "step": 338
    },
    {
      "epoch": 2.1794871794871793,
      "grad_norm": 0.08706427078942988,
      "learning_rate": 9.248457803103476e-06,
      "loss": 1.0038,
      "step": 340
    },
    {
      "epoch": 2.1923076923076925,
      "grad_norm": 0.0851666955740436,
      "learning_rate": 9.237242197080045e-06,
      "loss": 1.0218,
      "step": 342
    },
    {
      "epoch": 2.2051282051282053,
      "grad_norm": 0.08446573269728049,
      "learning_rate": 9.225950427718974e-06,
      "loss": 1.0254,
      "step": 344
    },
    {
      "epoch": 2.217948717948718,
      "grad_norm": 0.08907279788471897,
      "learning_rate": 9.21458269798691e-06,
      "loss": 0.9916,
      "step": 346
    },
    {
      "epoch": 2.230769230769231,
      "grad_norm": 0.09072043470187022,
      "learning_rate": 9.203139212215868e-06,
      "loss": 1.0103,
      "step": 348
    },
    {
      "epoch": 2.2435897435897436,
      "grad_norm": 0.08618586552830075,
      "learning_rate": 9.191620176099559e-06,
      "loss": 0.9995,
      "step": 350
    },
    {
      "epoch": 2.2564102564102564,
      "grad_norm": 0.09111342426909275,
      "learning_rate": 9.180025796689692e-06,
      "loss": 1.0292,
      "step": 352
    },
    {
      "epoch": 2.269230769230769,
      "grad_norm": 0.2022564482536435,
      "learning_rate": 9.168356282392253e-06,
      "loss": 1.0226,
      "step": 354
    },
    {
      "epoch": 2.282051282051282,
      "grad_norm": 0.1039362123101456,
      "learning_rate": 9.156611842963753e-06,
      "loss": 1.0152,
      "step": 356
    },
    {
      "epoch": 2.2948717948717947,
      "grad_norm": 0.10035717927769394,
      "learning_rate": 9.144792689507471e-06,
      "loss": 1.0049,
      "step": 358
    },
    {
      "epoch": 2.3076923076923075,
      "grad_norm": 0.08924064734394851,
      "learning_rate": 9.132899034469648e-06,
      "loss": 0.9962,
      "step": 360
    },
    {
      "epoch": 2.3205128205128207,
      "grad_norm": 0.09443040073005612,
      "learning_rate": 9.120931091635669e-06,
      "loss": 0.9976,
      "step": 362
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.09377508422363312,
      "learning_rate": 9.108889076126226e-06,
      "loss": 1.0306,
      "step": 364
    },
    {
      "epoch": 2.3461538461538463,
      "grad_norm": 0.0895229930946655,
      "learning_rate": 9.09677320439345e-06,
      "loss": 1.0126,
      "step": 366
    },
    {
      "epoch": 2.358974358974359,
      "grad_norm": 0.08795872722111464,
      "learning_rate": 9.084583694217012e-06,
      "loss": 0.9926,
      "step": 368
    },
    {
      "epoch": 2.371794871794872,
      "grad_norm": 0.08704560136887454,
      "learning_rate": 9.072320764700223e-06,
      "loss": 0.9978,
      "step": 370
    },
    {
      "epoch": 2.3846153846153846,
      "grad_norm": 0.0898387630341298,
      "learning_rate": 9.059984636266082e-06,
      "loss": 1.0042,
      "step": 372
    },
    {
      "epoch": 2.3974358974358974,
      "grad_norm": 0.08357247562762515,
      "learning_rate": 9.047575530653324e-06,
      "loss": 1.0094,
      "step": 374
    },
    {
      "epoch": 2.41025641025641,
      "grad_norm": 0.0843437057196144,
      "learning_rate": 9.035093670912424e-06,
      "loss": 0.9966,
      "step": 376
    },
    {
      "epoch": 2.423076923076923,
      "grad_norm": 0.08357196997203281,
      "learning_rate": 9.022539281401601e-06,
      "loss": 1.0038,
      "step": 378
    },
    {
      "epoch": 2.435897435897436,
      "grad_norm": 0.08859683961596204,
      "learning_rate": 9.009912587782772e-06,
      "loss": 1.0133,
      "step": 380
    },
    {
      "epoch": 2.448717948717949,
      "grad_norm": 0.09024266497375917,
      "learning_rate": 8.997213817017508e-06,
      "loss": 0.9782,
      "step": 382
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 0.0960929339414081,
      "learning_rate": 8.984443197362938e-06,
      "loss": 1.0013,
      "step": 384
    },
    {
      "epoch": 2.4743589743589745,
      "grad_norm": 0.08862629313408348,
      "learning_rate": 8.971600958367668e-06,
      "loss": 1.0059,
      "step": 386
    },
    {
      "epoch": 2.4871794871794872,
      "grad_norm": 0.09201716039902362,
      "learning_rate": 8.958687330867634e-06,
      "loss": 1.0263,
      "step": 388
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.08694363384662504,
      "learning_rate": 8.94570254698197e-06,
      "loss": 1.0163,
      "step": 390
    },
    {
      "epoch": 2.5128205128205128,
      "grad_norm": 0.09205164914341211,
      "learning_rate": 8.932646840108818e-06,
      "loss": 0.9865,
      "step": 392
    },
    {
      "epoch": 2.5256410256410255,
      "grad_norm": 0.09081872370987605,
      "learning_rate": 8.919520444921153e-06,
      "loss": 0.9819,
      "step": 394
    },
    {
      "epoch": 2.5384615384615383,
      "grad_norm": 0.08905442630582544,
      "learning_rate": 8.906323597362547e-06,
      "loss": 1.0171,
      "step": 396
    },
    {
      "epoch": 2.551282051282051,
      "grad_norm": 0.08717951944686292,
      "learning_rate": 8.893056534642938e-06,
      "loss": 1.0244,
      "step": 398
    },
    {
      "epoch": 2.564102564102564,
      "grad_norm": 0.09573458066741532,
      "learning_rate": 8.879719495234363e-06,
      "loss": 0.9848,
      "step": 400
    },
    {
      "epoch": 2.5769230769230766,
      "grad_norm": 0.0898624666623644,
      "learning_rate": 8.866312718866669e-06,
      "loss": 0.982,
      "step": 402
    },
    {
      "epoch": 2.58974358974359,
      "grad_norm": 0.09305658353350323,
      "learning_rate": 8.852836446523213e-06,
      "loss": 0.9742,
      "step": 404
    },
    {
      "epoch": 2.6025641025641026,
      "grad_norm": 0.08663704229153721,
      "learning_rate": 8.83929092043652e-06,
      "loss": 0.9783,
      "step": 406
    },
    {
      "epoch": 2.6153846153846154,
      "grad_norm": 0.08983846726156959,
      "learning_rate": 8.825676384083936e-06,
      "loss": 0.998,
      "step": 408
    },
    {
      "epoch": 2.628205128205128,
      "grad_norm": 0.09388895481313425,
      "learning_rate": 8.811993082183243e-06,
      "loss": 1.0005,
      "step": 410
    },
    {
      "epoch": 2.641025641025641,
      "grad_norm": 0.09226783931828283,
      "learning_rate": 8.798241260688273e-06,
      "loss": 1.0055,
      "step": 412
    },
    {
      "epoch": 2.6538461538461537,
      "grad_norm": 0.09021054214140613,
      "learning_rate": 8.784421166784476e-06,
      "loss": 0.9981,
      "step": 414
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.0860573848233807,
      "learning_rate": 8.770533048884483e-06,
      "loss": 1.0017,
      "step": 416
    },
    {
      "epoch": 2.6794871794871797,
      "grad_norm": 0.0880124822318372,
      "learning_rate": 8.756577156623636e-06,
      "loss": 0.9834,
      "step": 418
    },
    {
      "epoch": 2.6923076923076925,
      "grad_norm": 0.0867421199146975,
      "learning_rate": 8.742553740855507e-06,
      "loss": 0.9983,
      "step": 420
    },
    {
      "epoch": 2.7051282051282053,
      "grad_norm": 0.09006077507273828,
      "learning_rate": 8.728463053647382e-06,
      "loss": 0.9702,
      "step": 422
    },
    {
      "epoch": 2.717948717948718,
      "grad_norm": 0.08669250030062742,
      "learning_rate": 8.71430534827574e-06,
      "loss": 0.9952,
      "step": 424
    },
    {
      "epoch": 2.730769230769231,
      "grad_norm": 0.09026424854741899,
      "learning_rate": 8.700080879221689e-06,
      "loss": 1.0054,
      "step": 426
    },
    {
      "epoch": 2.7435897435897436,
      "grad_norm": 0.087975640704094,
      "learning_rate": 8.685789902166395e-06,
      "loss": 0.9845,
      "step": 428
    },
    {
      "epoch": 2.7564102564102564,
      "grad_norm": 0.08642431755631451,
      "learning_rate": 8.671432673986493e-06,
      "loss": 0.9791,
      "step": 430
    },
    {
      "epoch": 2.769230769230769,
      "grad_norm": 0.08649701419340423,
      "learning_rate": 8.657009452749466e-06,
      "loss": 0.9752,
      "step": 432
    },
    {
      "epoch": 2.782051282051282,
      "grad_norm": 0.0879183947838203,
      "learning_rate": 8.642520497709001e-06,
      "loss": 0.9788,
      "step": 434
    },
    {
      "epoch": 2.7948717948717947,
      "grad_norm": 0.08596416297337815,
      "learning_rate": 8.627966069300332e-06,
      "loss": 0.9807,
      "step": 436
    },
    {
      "epoch": 2.8076923076923075,
      "grad_norm": 0.08918860363970792,
      "learning_rate": 8.613346429135567e-06,
      "loss": 0.9958,
      "step": 438
    },
    {
      "epoch": 2.8205128205128203,
      "grad_norm": 0.08972585580799317,
      "learning_rate": 8.598661839998972e-06,
      "loss": 0.9895,
      "step": 440
    },
    {
      "epoch": 2.8333333333333335,
      "grad_norm": 0.08703685151364528,
      "learning_rate": 8.583912565842258e-06,
      "loss": 0.9652,
      "step": 442
    },
    {
      "epoch": 2.8461538461538463,
      "grad_norm": 0.08688465565057563,
      "learning_rate": 8.569098871779828e-06,
      "loss": 0.9984,
      "step": 444
    },
    {
      "epoch": 2.858974358974359,
      "grad_norm": 0.08809758545326962,
      "learning_rate": 8.554221024084019e-06,
      "loss": 0.9905,
      "step": 446
    },
    {
      "epoch": 2.871794871794872,
      "grad_norm": 0.08572911529655777,
      "learning_rate": 8.539279290180315e-06,
      "loss": 0.9692,
      "step": 448
    },
    {
      "epoch": 2.8846153846153846,
      "grad_norm": 0.08836722634323343,
      "learning_rate": 8.524273938642539e-06,
      "loss": 0.9547,
      "step": 450
    },
    {
      "epoch": 2.8974358974358974,
      "grad_norm": 0.09242854914045788,
      "learning_rate": 8.509205239188017e-06,
      "loss": 0.9838,
      "step": 452
    },
    {
      "epoch": 2.91025641025641,
      "grad_norm": 0.08849881930024005,
      "learning_rate": 8.494073462672743e-06,
      "loss": 0.9615,
      "step": 454
    },
    {
      "epoch": 2.9230769230769234,
      "grad_norm": 0.08854620618403236,
      "learning_rate": 8.478878881086505e-06,
      "loss": 0.9977,
      "step": 456
    },
    {
      "epoch": 2.935897435897436,
      "grad_norm": 0.094665430731143,
      "learning_rate": 8.463621767547998e-06,
      "loss": 0.9927,
      "step": 458
    },
    {
      "epoch": 2.948717948717949,
      "grad_norm": 0.09196410792880014,
      "learning_rate": 8.448302396299906e-06,
      "loss": 1.0113,
      "step": 460
    },
    {
      "epoch": 2.9615384615384617,
      "grad_norm": 0.09036486236859728,
      "learning_rate": 8.432921042703985e-06,
      "loss": 0.9457,
      "step": 462
    },
    {
      "epoch": 2.9743589743589745,
      "grad_norm": 0.08576032950610284,
      "learning_rate": 8.417477983236107e-06,
      "loss": 0.9645,
      "step": 464
    },
    {
      "epoch": 2.9871794871794872,
      "grad_norm": 0.08403590001526823,
      "learning_rate": 8.401973495481289e-06,
      "loss": 0.9544,
      "step": 466
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.09355532269950335,
      "learning_rate": 8.386407858128707e-06,
      "loss": 0.9719,
      "step": 468
    },
    {
      "epoch": 3.0128205128205128,
      "grad_norm": 0.08685232548889178,
      "learning_rate": 8.370781350966683e-06,
      "loss": 0.8933,
      "step": 470
    },
    {
      "epoch": 3.0256410256410255,
      "grad_norm": 0.10917681684685593,
      "learning_rate": 8.355094254877665e-06,
      "loss": 0.9222,
      "step": 472
    },
    {
      "epoch": 3.0384615384615383,
      "grad_norm": 0.09821414680349456,
      "learning_rate": 8.339346851833163e-06,
      "loss": 0.9187,
      "step": 474
    },
    {
      "epoch": 3.051282051282051,
      "grad_norm": 0.0953257584501641,
      "learning_rate": 8.323539424888695e-06,
      "loss": 0.9068,
      "step": 476
    },
    {
      "epoch": 3.064102564102564,
      "grad_norm": 0.10096821936698265,
      "learning_rate": 8.30767225817869e-06,
      "loss": 0.9005,
      "step": 478
    },
    {
      "epoch": 3.076923076923077,
      "grad_norm": 0.09745049198474258,
      "learning_rate": 8.291745636911382e-06,
      "loss": 0.8955,
      "step": 480
    },
    {
      "epoch": 3.08974358974359,
      "grad_norm": 0.09581071499737452,
      "learning_rate": 8.27575984736369e-06,
      "loss": 0.9034,
      "step": 482
    },
    {
      "epoch": 3.1025641025641026,
      "grad_norm": 0.09048589565605356,
      "learning_rate": 8.259715176876069e-06,
      "loss": 0.8964,
      "step": 484
    },
    {
      "epoch": 3.1153846153846154,
      "grad_norm": 0.09408149538192938,
      "learning_rate": 8.243611913847337e-06,
      "loss": 0.9157,
      "step": 486
    },
    {
      "epoch": 3.128205128205128,
      "grad_norm": 0.0947487050346647,
      "learning_rate": 8.2274503477295e-06,
      "loss": 0.9053,
      "step": 488
    },
    {
      "epoch": 3.141025641025641,
      "grad_norm": 0.09366500902355888,
      "learning_rate": 8.211230769022552e-06,
      "loss": 0.8925,
      "step": 490
    },
    {
      "epoch": 3.1538461538461537,
      "grad_norm": 0.09167161100151112,
      "learning_rate": 8.19495346926924e-06,
      "loss": 0.9165,
      "step": 492
    },
    {
      "epoch": 3.1666666666666665,
      "grad_norm": 0.09307041831758973,
      "learning_rate": 8.178618741049841e-06,
      "loss": 0.8989,
      "step": 494
    },
    {
      "epoch": 3.1794871794871793,
      "grad_norm": 0.09585560939367876,
      "learning_rate": 8.162226877976886e-06,
      "loss": 0.9147,
      "step": 496
    },
    {
      "epoch": 3.1923076923076925,
      "grad_norm": 0.09180060088840723,
      "learning_rate": 8.145778174689897e-06,
      "loss": 0.8882,
      "step": 498
    },
    {
      "epoch": 3.2051282051282053,
      "grad_norm": 0.09609878354099273,
      "learning_rate": 8.129272926850079e-06,
      "loss": 0.8744,
      "step": 500
    },
    {
      "epoch": 3.217948717948718,
      "grad_norm": 0.09691473472460625,
      "learning_rate": 8.112711431135014e-06,
      "loss": 0.8736,
      "step": 502
    },
    {
      "epoch": 3.230769230769231,
      "grad_norm": 0.09236636322834278,
      "learning_rate": 8.096093985233323e-06,
      "loss": 0.848,
      "step": 504
    },
    {
      "epoch": 3.2435897435897436,
      "grad_norm": 0.09704717599279773,
      "learning_rate": 8.079420887839316e-06,
      "loss": 0.8844,
      "step": 506
    },
    {
      "epoch": 3.2564102564102564,
      "grad_norm": 0.09939291409466518,
      "learning_rate": 8.062692438647628e-06,
      "loss": 0.8866,
      "step": 508
    },
    {
      "epoch": 3.269230769230769,
      "grad_norm": 0.09353962075083472,
      "learning_rate": 8.045908938347828e-06,
      "loss": 0.8742,
      "step": 510
    },
    {
      "epoch": 3.282051282051282,
      "grad_norm": 0.09465310178443197,
      "learning_rate": 8.029070688619013e-06,
      "loss": 0.8833,
      "step": 512
    },
    {
      "epoch": 3.2948717948717947,
      "grad_norm": 0.09443637715651476,
      "learning_rate": 8.012177992124385e-06,
      "loss": 0.8794,
      "step": 514
    },
    {
      "epoch": 3.3076923076923075,
      "grad_norm": 0.09728431520292821,
      "learning_rate": 7.995231152505815e-06,
      "loss": 0.8732,
      "step": 516
    },
    {
      "epoch": 3.3205128205128207,
      "grad_norm": 0.09428493650909285,
      "learning_rate": 7.978230474378383e-06,
      "loss": 0.8597,
      "step": 518
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.09850772889396305,
      "learning_rate": 7.961176263324902e-06,
      "loss": 0.8624,
      "step": 520
    },
    {
      "epoch": 3.3461538461538463,
      "grad_norm": 0.09087037549609535,
      "learning_rate": 7.944068825890424e-06,
      "loss": 0.8821,
      "step": 522
    },
    {
      "epoch": 3.358974358974359,
      "grad_norm": 0.09180369503983593,
      "learning_rate": 7.92690846957673e-06,
      "loss": 0.8688,
      "step": 524
    },
    {
      "epoch": 3.371794871794872,
      "grad_norm": 0.09491604280681391,
      "learning_rate": 7.909695502836814e-06,
      "loss": 0.8647,
      "step": 526
    },
    {
      "epoch": 3.3846153846153846,
      "grad_norm": 0.09921876854138406,
      "learning_rate": 7.892430235069317e-06,
      "loss": 0.8869,
      "step": 528
    },
    {
      "epoch": 3.3974358974358974,
      "grad_norm": 0.09457741703712105,
      "learning_rate": 7.875112976612984e-06,
      "loss": 0.8639,
      "step": 530
    },
    {
      "epoch": 3.41025641025641,
      "grad_norm": 0.09583219613481893,
      "learning_rate": 7.857744038741076e-06,
      "loss": 0.8805,
      "step": 532
    },
    {
      "epoch": 3.423076923076923,
      "grad_norm": 0.09260516206658106,
      "learning_rate": 7.84032373365578e-06,
      "loss": 0.8603,
      "step": 534
    },
    {
      "epoch": 3.435897435897436,
      "grad_norm": 0.09932108403192164,
      "learning_rate": 7.822852374482597e-06,
      "loss": 0.8658,
      "step": 536
    },
    {
      "epoch": 3.448717948717949,
      "grad_norm": 0.09728531208245553,
      "learning_rate": 7.805330275264707e-06,
      "loss": 0.8536,
      "step": 538
    },
    {
      "epoch": 3.4615384615384617,
      "grad_norm": 0.09952432033061036,
      "learning_rate": 7.787757750957335e-06,
      "loss": 0.8763,
      "step": 540
    },
    {
      "epoch": 3.4743589743589745,
      "grad_norm": 0.09845329832112057,
      "learning_rate": 7.77013511742208e-06,
      "loss": 0.8658,
      "step": 542
    },
    {
      "epoch": 3.4871794871794872,
      "grad_norm": 0.10349699075619775,
      "learning_rate": 7.752462691421245e-06,
      "loss": 0.8538,
      "step": 544
    },
    {
      "epoch": 3.5,
      "grad_norm": 0.15469316317671902,
      "learning_rate": 7.734740790612137e-06,
      "loss": 0.8644,
      "step": 546
    },
    {
      "epoch": 3.5128205128205128,
      "grad_norm": 0.09649309700047885,
      "learning_rate": 7.716969733541357e-06,
      "loss": 0.8755,
      "step": 548
    },
    {
      "epoch": 3.5256410256410255,
      "grad_norm": 0.09860823779259517,
      "learning_rate": 7.699149839639086e-06,
      "loss": 0.8471,
      "step": 550
    },
    {
      "epoch": 3.5384615384615383,
      "grad_norm": 0.09867635522074884,
      "learning_rate": 7.681281429213328e-06,
      "loss": 0.8512,
      "step": 552
    },
    {
      "epoch": 3.551282051282051,
      "grad_norm": 0.09856703594780034,
      "learning_rate": 7.663364823444157e-06,
      "loss": 0.8581,
      "step": 554
    },
    {
      "epoch": 3.564102564102564,
      "grad_norm": 0.10120010505390695,
      "learning_rate": 7.645400344377953e-06,
      "loss": 0.8647,
      "step": 556
    },
    {
      "epoch": 3.5769230769230766,
      "grad_norm": 0.09353647856294549,
      "learning_rate": 7.627388314921602e-06,
      "loss": 0.8563,
      "step": 558
    },
    {
      "epoch": 3.58974358974359,
      "grad_norm": 0.097727849555005,
      "learning_rate": 7.609329058836694e-06,
      "loss": 0.8629,
      "step": 560
    },
    {
      "epoch": 3.6025641025641026,
      "grad_norm": 0.09185843649741915,
      "learning_rate": 7.59122290073371e-06,
      "loss": 0.8517,
      "step": 562
    },
    {
      "epoch": 3.6153846153846154,
      "grad_norm": 0.16467906411387448,
      "learning_rate": 7.5730701660661795e-06,
      "loss": 0.8588,
      "step": 564
    },
    {
      "epoch": 3.628205128205128,
      "grad_norm": 0.10490078157659109,
      "learning_rate": 7.554871181124836e-06,
      "loss": 0.8916,
      "step": 566
    },
    {
      "epoch": 3.641025641025641,
      "grad_norm": 0.09862237486460196,
      "learning_rate": 7.536626273031747e-06,
      "loss": 0.8486,
      "step": 568
    },
    {
      "epoch": 3.6538461538461537,
      "grad_norm": 0.09855168103779419,
      "learning_rate": 7.5183357697344395e-06,
      "loss": 0.8532,
      "step": 570
    },
    {
      "epoch": 3.6666666666666665,
      "grad_norm": 0.09943631897387811,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.8643,
      "step": 572
    },
    {
      "epoch": 3.6794871794871797,
      "grad_norm": 0.09470558794565637,
      "learning_rate": 7.481619293409173e-06,
      "loss": 0.8705,
      "step": 574
    },
    {
      "epoch": 3.6923076923076925,
      "grad_norm": 0.09434833275033037,
      "learning_rate": 7.4631939803504215e-06,
      "loss": 0.8597,
      "step": 576
    },
    {
      "epoch": 3.7051282051282053,
      "grad_norm": 0.09852625213361811,
      "learning_rate": 7.44472439201401e-06,
      "loss": 0.8665,
      "step": 578
    },
    {
      "epoch": 3.717948717948718,
      "grad_norm": 0.09522012579767557,
      "learning_rate": 7.426210860386032e-06,
      "loss": 0.8373,
      "step": 580
    },
    {
      "epoch": 3.730769230769231,
      "grad_norm": 0.09872214935386595,
      "learning_rate": 7.407653718242449e-06,
      "loss": 0.8266,
      "step": 582
    },
    {
      "epoch": 3.7435897435897436,
      "grad_norm": 0.09611754066886699,
      "learning_rate": 7.3890532991431174e-06,
      "loss": 0.8422,
      "step": 584
    },
    {
      "epoch": 3.7564102564102564,
      "grad_norm": 0.09430702389773353,
      "learning_rate": 7.370409937425781e-06,
      "loss": 0.8349,
      "step": 586
    },
    {
      "epoch": 3.769230769230769,
      "grad_norm": 0.10000120202753963,
      "learning_rate": 7.3517239682000675e-06,
      "loss": 0.8589,
      "step": 588
    },
    {
      "epoch": 3.782051282051282,
      "grad_norm": 0.09477208728170344,
      "learning_rate": 7.332995727341462e-06,
      "loss": 0.8591,
      "step": 590
    },
    {
      "epoch": 3.7948717948717947,
      "grad_norm": 0.09696166000717225,
      "learning_rate": 7.314225551485273e-06,
      "loss": 0.8397,
      "step": 592
    },
    {
      "epoch": 3.8076923076923075,
      "grad_norm": 0.09621353397155066,
      "learning_rate": 7.295413778020579e-06,
      "loss": 0.8166,
      "step": 594
    },
    {
      "epoch": 3.8205128205128203,
      "grad_norm": 0.09692687114207367,
      "learning_rate": 7.276560745084167e-06,
      "loss": 0.8521,
      "step": 596
    },
    {
      "epoch": 3.8333333333333335,
      "grad_norm": 0.09885126357081214,
      "learning_rate": 7.257666791554448e-06,
      "loss": 0.8416,
      "step": 598
    },
    {
      "epoch": 3.8461538461538463,
      "grad_norm": 0.10239714078021848,
      "learning_rate": 7.2387322570453724e-06,
      "loss": 0.8324,
      "step": 600
    },
    {
      "epoch": 3.858974358974359,
      "grad_norm": 0.11251898784242197,
      "learning_rate": 7.219757481900325e-06,
      "loss": 0.835,
      "step": 602
    },
    {
      "epoch": 3.871794871794872,
      "grad_norm": 0.1005799166719958,
      "learning_rate": 7.2007428071860045e-06,
      "loss": 0.8035,
      "step": 604
    },
    {
      "epoch": 3.8846153846153846,
      "grad_norm": 0.10103534145014936,
      "learning_rate": 7.181688574686292e-06,
      "loss": 0.8709,
      "step": 606
    },
    {
      "epoch": 3.8974358974358974,
      "grad_norm": 0.10027552225015914,
      "learning_rate": 7.162595126896111e-06,
      "loss": 0.8319,
      "step": 608
    },
    {
      "epoch": 3.91025641025641,
      "grad_norm": 0.10075780749863547,
      "learning_rate": 7.143462807015271e-06,
      "loss": 0.8323,
      "step": 610
    },
    {
      "epoch": 3.9230769230769234,
      "grad_norm": 0.09472929060217589,
      "learning_rate": 7.1242919589422974e-06,
      "loss": 0.8185,
      "step": 612
    },
    {
      "epoch": 3.935897435897436,
      "grad_norm": 0.09472378350788888,
      "learning_rate": 7.105082927268247e-06,
      "loss": 0.8304,
      "step": 614
    },
    {
      "epoch": 3.948717948717949,
      "grad_norm": 0.10337359146731352,
      "learning_rate": 7.085836057270521e-06,
      "loss": 0.8174,
      "step": 616
    },
    {
      "epoch": 3.9615384615384617,
      "grad_norm": 0.0983672088113577,
      "learning_rate": 7.066551694906651e-06,
      "loss": 0.8322,
      "step": 618
    },
    {
      "epoch": 3.9743589743589745,
      "grad_norm": 0.1019500525911841,
      "learning_rate": 7.047230186808085e-06,
      "loss": 0.8021,
      "step": 620
    },
    {
      "epoch": 3.9871794871794872,
      "grad_norm": 0.09750574300751329,
      "learning_rate": 7.027871880273959e-06,
      "loss": 0.7983,
      "step": 622
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.10208128186441004,
      "learning_rate": 7.008477123264849e-06,
      "loss": 0.8239,
      "step": 624
    },
    {
      "epoch": 4.012820512820513,
      "grad_norm": 0.10734300522197977,
      "learning_rate": 6.989046264396516e-06,
      "loss": 0.7678,
      "step": 626
    },
    {
      "epoch": 4.0256410256410255,
      "grad_norm": 0.102980617519378,
      "learning_rate": 6.96957965293365e-06,
      "loss": 0.7377,
      "step": 628
    },
    {
      "epoch": 4.038461538461538,
      "grad_norm": 0.12545611352143843,
      "learning_rate": 6.9500776387835785e-06,
      "loss": 0.7581,
      "step": 630
    },
    {
      "epoch": 4.051282051282051,
      "grad_norm": 0.122707057481331,
      "learning_rate": 6.9305405724899876e-06,
      "loss": 0.7399,
      "step": 632
    },
    {
      "epoch": 4.064102564102564,
      "grad_norm": 0.11397293701236821,
      "learning_rate": 6.91096880522661e-06,
      "loss": 0.7447,
      "step": 634
    },
    {
      "epoch": 4.076923076923077,
      "grad_norm": 0.13487306338562802,
      "learning_rate": 6.891362688790925e-06,
      "loss": 0.7546,
      "step": 636
    },
    {
      "epoch": 4.089743589743589,
      "grad_norm": 0.10896326697255375,
      "learning_rate": 6.871722575597829e-06,
      "loss": 0.7579,
      "step": 638
    },
    {
      "epoch": 4.102564102564102,
      "grad_norm": 0.11165624709106642,
      "learning_rate": 6.8520488186733e-06,
      "loss": 0.7517,
      "step": 640
    },
    {
      "epoch": 4.115384615384615,
      "grad_norm": 0.11518303790398043,
      "learning_rate": 6.832341771648057e-06,
      "loss": 0.7459,
      "step": 642
    },
    {
      "epoch": 4.128205128205128,
      "grad_norm": 0.11119475069076129,
      "learning_rate": 6.812601788751192e-06,
      "loss": 0.7825,
      "step": 644
    },
    {
      "epoch": 4.141025641025641,
      "grad_norm": 0.10743202492055963,
      "learning_rate": 6.792829224803816e-06,
      "loss": 0.7395,
      "step": 646
    },
    {
      "epoch": 4.153846153846154,
      "grad_norm": 0.10582985557685172,
      "learning_rate": 6.773024435212678e-06,
      "loss": 0.7637,
      "step": 648
    },
    {
      "epoch": 4.166666666666667,
      "grad_norm": 0.10835455281881788,
      "learning_rate": 6.753187775963773e-06,
      "loss": 0.7576,
      "step": 650
    },
    {
      "epoch": 4.17948717948718,
      "grad_norm": 0.1107213708183791,
      "learning_rate": 6.733319603615941e-06,
      "loss": 0.7519,
      "step": 652
    },
    {
      "epoch": 4.1923076923076925,
      "grad_norm": 0.11143239841237282,
      "learning_rate": 6.713420275294467e-06,
      "loss": 0.7421,
      "step": 654
    },
    {
      "epoch": 4.205128205128205,
      "grad_norm": 0.10135913047939792,
      "learning_rate": 6.693490148684654e-06,
      "loss": 0.7503,
      "step": 656
    },
    {
      "epoch": 4.217948717948718,
      "grad_norm": 0.10935890173613132,
      "learning_rate": 6.673529582025398e-06,
      "loss": 0.7469,
      "step": 658
    },
    {
      "epoch": 4.230769230769231,
      "grad_norm": 0.10682800250997206,
      "learning_rate": 6.653538934102743e-06,
      "loss": 0.7519,
      "step": 660
    },
    {
      "epoch": 4.243589743589744,
      "grad_norm": 0.11174312070750286,
      "learning_rate": 6.633518564243442e-06,
      "loss": 0.7388,
      "step": 662
    },
    {
      "epoch": 4.256410256410256,
      "grad_norm": 0.10996882792698588,
      "learning_rate": 6.6134688323084884e-06,
      "loss": 0.735,
      "step": 664
    },
    {
      "epoch": 4.269230769230769,
      "grad_norm": 0.11513381552989353,
      "learning_rate": 6.593390098686653e-06,
      "loss": 0.7266,
      "step": 666
    },
    {
      "epoch": 4.282051282051282,
      "grad_norm": 0.10383307615951057,
      "learning_rate": 6.573282724288001e-06,
      "loss": 0.7354,
      "step": 668
    },
    {
      "epoch": 4.294871794871795,
      "grad_norm": 0.10064526192695795,
      "learning_rate": 6.553147070537413e-06,
      "loss": 0.7316,
      "step": 670
    },
    {
      "epoch": 4.3076923076923075,
      "grad_norm": 0.10546529880700707,
      "learning_rate": 6.532983499368078e-06,
      "loss": 0.7345,
      "step": 672
    },
    {
      "epoch": 4.32051282051282,
      "grad_norm": 0.10452514955349174,
      "learning_rate": 6.512792373215e-06,
      "loss": 0.7552,
      "step": 674
    },
    {
      "epoch": 4.333333333333333,
      "grad_norm": 0.10501851155628895,
      "learning_rate": 6.492574055008474e-06,
      "loss": 0.715,
      "step": 676
    },
    {
      "epoch": 4.346153846153846,
      "grad_norm": 0.10411036896818421,
      "learning_rate": 6.472328908167562e-06,
      "loss": 0.729,
      "step": 678
    },
    {
      "epoch": 4.358974358974359,
      "grad_norm": 0.11127049713049718,
      "learning_rate": 6.452057296593568e-06,
      "loss": 0.744,
      "step": 680
    },
    {
      "epoch": 4.371794871794872,
      "grad_norm": 0.12676881136201423,
      "learning_rate": 6.431759584663492e-06,
      "loss": 0.7588,
      "step": 682
    },
    {
      "epoch": 4.384615384615385,
      "grad_norm": 0.105870619579206,
      "learning_rate": 6.411436137223479e-06,
      "loss": 0.7247,
      "step": 684
    },
    {
      "epoch": 4.397435897435898,
      "grad_norm": 0.10374120826824249,
      "learning_rate": 6.391087319582264e-06,
      "loss": 0.7309,
      "step": 686
    },
    {
      "epoch": 4.410256410256411,
      "grad_norm": 0.10865846153479375,
      "learning_rate": 6.370713497504607e-06,
      "loss": 0.7482,
      "step": 688
    },
    {
      "epoch": 4.423076923076923,
      "grad_norm": 0.11160085810481411,
      "learning_rate": 6.350315037204714e-06,
      "loss": 0.7254,
      "step": 690
    },
    {
      "epoch": 4.435897435897436,
      "grad_norm": 0.10544486611527323,
      "learning_rate": 6.329892305339659e-06,
      "loss": 0.7053,
      "step": 692
    },
    {
      "epoch": 4.448717948717949,
      "grad_norm": 0.10611707780750092,
      "learning_rate": 6.309445669002787e-06,
      "loss": 0.7078,
      "step": 694
    },
    {
      "epoch": 4.461538461538462,
      "grad_norm": 0.10588157071847835,
      "learning_rate": 6.288975495717124e-06,
      "loss": 0.7412,
      "step": 696
    },
    {
      "epoch": 4.4743589743589745,
      "grad_norm": 0.10785564192135899,
      "learning_rate": 6.268482153428763e-06,
      "loss": 0.7289,
      "step": 698
    },
    {
      "epoch": 4.487179487179487,
      "grad_norm": 0.10456174831291559,
      "learning_rate": 6.247966010500258e-06,
      "loss": 0.7233,
      "step": 700
    },
    {
      "epoch": 4.5,
      "grad_norm": 0.10739198046560715,
      "learning_rate": 6.227427435703997e-06,
      "loss": 0.7308,
      "step": 702
    },
    {
      "epoch": 4.512820512820513,
      "grad_norm": 0.11062331534549659,
      "learning_rate": 6.206866798215571e-06,
      "loss": 0.7188,
      "step": 704
    },
    {
      "epoch": 4.5256410256410255,
      "grad_norm": 0.1120412879852177,
      "learning_rate": 6.186284467607149e-06,
      "loss": 0.7149,
      "step": 706
    },
    {
      "epoch": 4.538461538461538,
      "grad_norm": 0.10581044212948068,
      "learning_rate": 6.165680813840822e-06,
      "loss": 0.7286,
      "step": 708
    },
    {
      "epoch": 4.551282051282051,
      "grad_norm": 0.10581925858925155,
      "learning_rate": 6.1450562072619635e-06,
      "loss": 0.6854,
      "step": 710
    },
    {
      "epoch": 4.564102564102564,
      "grad_norm": 0.11850925857398317,
      "learning_rate": 6.124411018592568e-06,
      "loss": 0.7215,
      "step": 712
    },
    {
      "epoch": 4.576923076923077,
      "grad_norm": 0.12029983367038724,
      "learning_rate": 6.103745618924587e-06,
      "loss": 0.7142,
      "step": 714
    },
    {
      "epoch": 4.589743589743589,
      "grad_norm": 0.10567103079137533,
      "learning_rate": 6.0830603797132574e-06,
      "loss": 0.7162,
      "step": 716
    },
    {
      "epoch": 4.602564102564102,
      "grad_norm": 0.10836686741052724,
      "learning_rate": 6.0623556727704306e-06,
      "loss": 0.7165,
      "step": 718
    },
    {
      "epoch": 4.615384615384615,
      "grad_norm": 0.11249604087548312,
      "learning_rate": 6.041631870257882e-06,
      "loss": 0.7383,
      "step": 720
    },
    {
      "epoch": 4.628205128205128,
      "grad_norm": 0.1082063599668396,
      "learning_rate": 6.020889344680627e-06,
      "loss": 0.6952,
      "step": 722
    },
    {
      "epoch": 4.641025641025641,
      "grad_norm": 0.10282892185990167,
      "learning_rate": 6.000128468880223e-06,
      "loss": 0.7167,
      "step": 724
    },
    {
      "epoch": 4.653846153846154,
      "grad_norm": 0.14775806059988206,
      "learning_rate": 5.979349616028067e-06,
      "loss": 0.7015,
      "step": 726
    },
    {
      "epoch": 4.666666666666667,
      "grad_norm": 0.1146560861251404,
      "learning_rate": 5.958553159618693e-06,
      "loss": 0.7213,
      "step": 728
    },
    {
      "epoch": 4.67948717948718,
      "grad_norm": 0.10561771243314702,
      "learning_rate": 5.937739473463047e-06,
      "loss": 0.7296,
      "step": 730
    },
    {
      "epoch": 4.6923076923076925,
      "grad_norm": 0.1030552904773058,
      "learning_rate": 5.916908931681781e-06,
      "loss": 0.7123,
      "step": 732
    },
    {
      "epoch": 4.705128205128205,
      "grad_norm": 0.11007539115142843,
      "learning_rate": 5.896061908698521e-06,
      "loss": 0.7048,
      "step": 734
    },
    {
      "epoch": 4.717948717948718,
      "grad_norm": 0.11416376306689043,
      "learning_rate": 5.8751987792331365e-06,
      "loss": 0.7137,
      "step": 736
    },
    {
      "epoch": 4.730769230769231,
      "grad_norm": 0.10152180107259362,
      "learning_rate": 5.854319918295012e-06,
"loss": 0.7051, | |
"step": 738 | |
}, | |
{ | |
"epoch": 4.743589743589744, | |
"grad_norm": 0.11206883891832514, | |
"learning_rate": 5.833425701176294e-06, | |
"loss": 0.6923, | |
"step": 740 | |
}, | |
{ | |
"epoch": 4.756410256410256, | |
"grad_norm": 0.10804199427828234, | |
"learning_rate": 5.812516503445158e-06, | |
"loss": 0.6955, | |
"step": 742 | |
}, | |
{ | |
"epoch": 4.769230769230769, | |
"grad_norm": 0.10618536471151145, | |
"learning_rate": 5.79159270093905e-06, | |
"loss": 0.7051, | |
"step": 744 | |
}, | |
{ | |
"epoch": 4.782051282051282, | |
"grad_norm": 0.112445946670164, | |
"learning_rate": 5.770654669757935e-06, | |
"loss": 0.6862, | |
"step": 746 | |
}, | |
{ | |
"epoch": 4.794871794871795, | |
"grad_norm": 0.10623939719616725, | |
"learning_rate": 5.749702786257529e-06, | |
"loss": 0.7021, | |
"step": 748 | |
}, | |
{ | |
"epoch": 4.8076923076923075, | |
"grad_norm": 0.11066503537728437, | |
"learning_rate": 5.7287374270425475e-06, | |
"loss": 0.7083, | |
"step": 750 | |
}, | |
{ | |
"epoch": 4.82051282051282, | |
"grad_norm": 0.11956485729401434, | |
"learning_rate": 5.707758968959923e-06, | |
"loss": 0.7052, | |
"step": 752 | |
}, | |
{ | |
"epoch": 4.833333333333333, | |
"grad_norm": 0.11607859173183654, | |
"learning_rate": 5.686767789092041e-06, | |
"loss": 0.7114, | |
"step": 754 | |
}, | |
{ | |
"epoch": 4.846153846153846, | |
"grad_norm": 0.10875829497210732, | |
"learning_rate": 5.6657642647499545e-06, | |
"loss": 0.7159, | |
"step": 756 | |
}, | |
{ | |
"epoch": 4.858974358974359, | |
"grad_norm": 0.10952816111674243, | |
"learning_rate": 5.644748773466606e-06, | |
"loss": 0.7036, | |
"step": 758 | |
}, | |
{ | |
"epoch": 4.871794871794872, | |
"grad_norm": 0.10684780948629531, | |
"learning_rate": 5.62372169299004e-06, | |
"loss": 0.7225, | |
"step": 760 | |
}, | |
{ | |
"epoch": 4.884615384615385, | |
"grad_norm": 0.1047662448976948, | |
"learning_rate": 5.6026834012766155e-06, | |
"loss": 0.6805, | |
"step": 762 | |
}, | |
{ | |
"epoch": 4.897435897435898, | |
"grad_norm": 0.10955003114927836, | |
"learning_rate": 5.581634276484211e-06, | |
"loss": 0.6792, | |
"step": 764 | |
}, | |
{ | |
"epoch": 4.910256410256411, | |
"grad_norm": 0.10878018550941551, | |
"learning_rate": 5.560574696965425e-06, | |
"loss": 0.6921, | |
"step": 766 | |
}, | |
{ | |
"epoch": 4.923076923076923, | |
"grad_norm": 0.11093790171018045, | |
"learning_rate": 5.539505041260779e-06, | |
"loss": 0.6956, | |
"step": 768 | |
}, | |
{ | |
"epoch": 4.935897435897436, | |
"grad_norm": 0.1115655815203421, | |
"learning_rate": 5.518425688091906e-06, | |
"loss": 0.7024, | |
"step": 770 | |
}, | |
{ | |
"epoch": 4.948717948717949, | |
"grad_norm": 0.1131005595068268, | |
"learning_rate": 5.497337016354757e-06, | |
"loss": 0.7148, | |
"step": 772 | |
}, | |
{ | |
"epoch": 4.961538461538462, | |
"grad_norm": 0.11347516336979874, | |
"learning_rate": 5.476239405112775e-06, | |
"loss": 0.6816, | |
"step": 774 | |
}, | |
{ | |
"epoch": 4.9743589743589745, | |
"grad_norm": 0.10898186232548415, | |
"learning_rate": 5.45513323359009e-06, | |
"loss": 0.7273, | |
"step": 776 | |
}, | |
{ | |
"epoch": 4.987179487179487, | |
"grad_norm": 0.11549198646562549, | |
"learning_rate": 5.434018881164702e-06, | |
"loss": 0.6917, | |
"step": 778 | |
}, | |
{ | |
"epoch": 5.0, | |
"grad_norm": 0.10772346133987304, | |
"learning_rate": 5.412896727361663e-06, | |
"loss": 0.6863, | |
"step": 780 | |
}, | |
{ | |
"epoch": 5.012820512820513, | |
"grad_norm": 0.12047900705924511, | |
"learning_rate": 5.391767151846247e-06, | |
"loss": 0.6282, | |
"step": 782 | |
}, | |
{ | |
"epoch": 5.0256410256410255, | |
"grad_norm": 0.11697315416876589, | |
"learning_rate": 5.370630534417133e-06, | |
"loss": 0.6488, | |
"step": 784 | |
}, | |
{ | |
"epoch": 5.038461538461538, | |
"grad_norm": 0.12319515362238828, | |
"learning_rate": 5.349487254999579e-06, | |
"loss": 0.6356, | |
"step": 786 | |
}, | |
{ | |
"epoch": 5.051282051282051, | |
"grad_norm": 0.13800689390518142, | |
"learning_rate": 5.328337693638591e-06, | |
"loss": 0.6048, | |
"step": 788 | |
}, | |
{ | |
"epoch": 5.064102564102564, | |
"grad_norm": 0.15216649155566264, | |
"learning_rate": 5.307182230492089e-06, | |
"loss": 0.6275, | |
"step": 790 | |
}, | |
{ | |
"epoch": 5.076923076923077, | |
"grad_norm": 0.14128455530647008, | |
"learning_rate": 5.286021245824075e-06, | |
"loss": 0.6255, | |
"step": 792 | |
}, | |
{ | |
"epoch": 5.089743589743589, | |
"grad_norm": 0.11688265140577293, | |
"learning_rate": 5.264855119997803e-06, | |
"loss": 0.6257, | |
"step": 794 | |
}, | |
{ | |
"epoch": 5.102564102564102, | |
"grad_norm": 0.11464621859154618, | |
"learning_rate": 5.243684233468933e-06, | |
"loss": 0.6157, | |
"step": 796 | |
}, | |
{ | |
"epoch": 5.115384615384615, | |
"grad_norm": 0.13981445654349306, | |
"learning_rate": 5.222508966778702e-06, | |
"loss": 0.6508, | |
"step": 798 | |
}, | |
{ | |
"epoch": 5.128205128205128, | |
"grad_norm": 0.14392893164542925, | |
"learning_rate": 5.201329700547077e-06, | |
"loss": 0.6287, | |
"step": 800 | |
}, | |
{ | |
"epoch": 5.141025641025641, | |
"grad_norm": 0.13170699750399084, | |
"learning_rate": 5.180146815465915e-06, | |
"loss": 0.614, | |
"step": 802 | |
}, | |
{ | |
"epoch": 5.153846153846154, | |
"grad_norm": 0.10789843763721454, | |
"learning_rate": 5.158960692292122e-06, | |
"loss": 0.6078, | |
"step": 804 | |
}, | |
{ | |
"epoch": 5.166666666666667, | |
"grad_norm": 0.10695990705186179, | |
"learning_rate": 5.137771711840811e-06, | |
"loss": 0.6034, | |
"step": 806 | |
}, | |
{ | |
"epoch": 5.17948717948718, | |
"grad_norm": 0.11206607562131153, | |
"learning_rate": 5.116580254978447e-06, | |
"loss": 0.5992, | |
"step": 808 | |
}, | |
{ | |
"epoch": 5.1923076923076925, | |
"grad_norm": 0.118691570735362, | |
"learning_rate": 5.095386702616012e-06, | |
"loss": 0.6111, | |
"step": 810 | |
}, | |
{ | |
"epoch": 5.205128205128205, | |
"grad_norm": 0.10922869228321892, | |
"learning_rate": 5.074191435702155e-06, | |
"loss": 0.5879, | |
"step": 812 | |
}, | |
{ | |
"epoch": 5.217948717948718, | |
"grad_norm": 0.11128326921959672, | |
"learning_rate": 5.05299483521634e-06, | |
"loss": 0.6165, | |
"step": 814 | |
}, | |
{ | |
"epoch": 5.230769230769231, | |
"grad_norm": 0.10582354369475495, | |
"learning_rate": 5.031797282162007e-06, | |
"loss": 0.5897, | |
"step": 816 | |
}, | |
{ | |
"epoch": 5.243589743589744, | |
"grad_norm": 0.11451654465777673, | |
"learning_rate": 5.010599157559713e-06, | |
"loss": 0.6062, | |
"step": 818 | |
}, | |
{ | |
"epoch": 5.256410256410256, | |
"grad_norm": 0.10173363821181294, | |
"learning_rate": 4.98940084244029e-06, | |
"loss": 0.6285, | |
"step": 820 | |
}, | |
{ | |
"epoch": 5.269230769230769, | |
"grad_norm": 0.11291157343417497, | |
"learning_rate": 4.968202717837996e-06, | |
"loss": 0.6049, | |
"step": 822 | |
}, | |
{ | |
"epoch": 5.282051282051282, | |
"grad_norm": 0.12019898107948757, | |
"learning_rate": 4.947005164783661e-06, | |
"loss": 0.6208, | |
"step": 824 | |
}, | |
{ | |
"epoch": 5.294871794871795, | |
"grad_norm": 0.114373764807783, | |
"learning_rate": 4.925808564297847e-06, | |
"loss": 0.6043, | |
"step": 826 | |
}, | |
{ | |
"epoch": 5.3076923076923075, | |
"grad_norm": 0.11491095476092386, | |
"learning_rate": 4.9046132973839895e-06, | |
"loss": 0.6519, | |
"step": 828 | |
}, | |
{ | |
"epoch": 5.32051282051282, | |
"grad_norm": 0.11758969246232907, | |
"learning_rate": 4.883419745021554e-06, | |
"loss": 0.6156, | |
"step": 830 | |
}, | |
{ | |
"epoch": 5.333333333333333, | |
"grad_norm": 0.11120312257140173, | |
"learning_rate": 4.862228288159191e-06, | |
"loss": 0.6217, | |
"step": 832 | |
}, | |
{ | |
"epoch": 5.346153846153846, | |
"grad_norm": 0.11752943064735909, | |
"learning_rate": 4.841039307707878e-06, | |
"loss": 0.6234, | |
"step": 834 | |
}, | |
{ | |
"epoch": 5.358974358974359, | |
"grad_norm": 0.11864432138040991, | |
"learning_rate": 4.819853184534085e-06, | |
"loss": 0.5947, | |
"step": 836 | |
}, | |
{ | |
"epoch": 5.371794871794872, | |
"grad_norm": 0.11941298362678118, | |
"learning_rate": 4.798670299452926e-06, | |
"loss": 0.613, | |
"step": 838 | |
}, | |
{ | |
"epoch": 5.384615384615385, | |
"grad_norm": 0.11307708710954777, | |
"learning_rate": 4.7774910332213005e-06, | |
"loss": 0.594, | |
"step": 840 | |
}, | |
{ | |
"epoch": 5.397435897435898, | |
"grad_norm": 0.1093069103681417, | |
"learning_rate": 4.756315766531069e-06, | |
"loss": 0.6049, | |
"step": 842 | |
}, | |
{ | |
"epoch": 5.410256410256411, | |
"grad_norm": 0.11381070200838837, | |
"learning_rate": 4.735144880002199e-06, | |
"loss": 0.6105, | |
"step": 844 | |
}, | |
{ | |
"epoch": 5.423076923076923, | |
"grad_norm": 0.1318249567633448, | |
"learning_rate": 4.713978754175926e-06, | |
"loss": 0.6002, | |
"step": 846 | |
}, | |
{ | |
"epoch": 5.435897435897436, | |
"grad_norm": 0.11599444919219819, | |
"learning_rate": 4.692817769507912e-06, | |
"loss": 0.6042, | |
"step": 848 | |
}, | |
{ | |
"epoch": 5.448717948717949, | |
"grad_norm": 0.11182097821019421, | |
"learning_rate": 4.671662306361409e-06, | |
"loss": 0.6103, | |
"step": 850 | |
}, | |
{ | |
"epoch": 5.461538461538462, | |
"grad_norm": 0.10954799492546995, | |
"learning_rate": 4.6505127450004216e-06, | |
"loss": 0.6016, | |
"step": 852 | |
}, | |
{ | |
"epoch": 5.4743589743589745, | |
"grad_norm": 0.11308176387281334, | |
"learning_rate": 4.62936946558287e-06, | |
"loss": 0.6051, | |
"step": 854 | |
}, | |
{ | |
"epoch": 5.487179487179487, | |
"grad_norm": 0.10759177792306925, | |
"learning_rate": 4.608232848153757e-06, | |
"loss": 0.6134, | |
"step": 856 | |
}, | |
{ | |
"epoch": 5.5, | |
"grad_norm": 0.11581427783989148, | |
"learning_rate": 4.587103272638339e-06, | |
"loss": 0.6119, | |
"step": 858 | |
}, | |
{ | |
"epoch": 5.512820512820513, | |
"grad_norm": 0.11905934839017085, | |
"learning_rate": 4.565981118835299e-06, | |
"loss": 0.5898, | |
"step": 860 | |
}, | |
{ | |
"epoch": 5.5256410256410255, | |
"grad_norm": 0.11350808165481102, | |
"learning_rate": 4.5448667664099125e-06, | |
"loss": 0.5917, | |
"step": 862 | |
}, | |
{ | |
"epoch": 5.538461538461538, | |
"grad_norm": 0.10584763781810944, | |
"learning_rate": 4.523760594887228e-06, | |
"loss": 0.5989, | |
"step": 864 | |
}, | |
{ | |
"epoch": 5.551282051282051, | |
"grad_norm": 0.1052428154285861, | |
"learning_rate": 4.5026629836452445e-06, | |
"loss": 0.5957, | |
"step": 866 | |
}, | |
{ | |
"epoch": 5.564102564102564, | |
"grad_norm": 0.10854637153715369, | |
"learning_rate": 4.481574311908096e-06, | |
"loss": 0.6091, | |
"step": 868 | |
}, | |
{ | |
"epoch": 5.576923076923077, | |
"grad_norm": 0.1114814068673184, | |
"learning_rate": 4.460494958739223e-06, | |
"loss": 0.5982, | |
"step": 870 | |
}, | |
{ | |
"epoch": 5.589743589743589, | |
"grad_norm": 0.10679516983860898, | |
"learning_rate": 4.439425303034576e-06, | |
"loss": 0.5781, | |
"step": 872 | |
}, | |
{ | |
"epoch": 5.602564102564102, | |
"grad_norm": 0.11151279055313694, | |
"learning_rate": 4.418365723515791e-06, | |
"loss": 0.5875, | |
"step": 874 | |
}, | |
{ | |
"epoch": 5.615384615384615, | |
"grad_norm": 0.11196990690511702, | |
"learning_rate": 4.397316598723385e-06, | |
"loss": 0.6389, | |
"step": 876 | |
}, | |
{ | |
"epoch": 5.628205128205128, | |
"grad_norm": 0.11160421611762042, | |
"learning_rate": 4.376278307009962e-06, | |
"loss": 0.5966, | |
"step": 878 | |
}, | |
{ | |
"epoch": 5.641025641025641, | |
"grad_norm": 0.11892010523420538, | |
"learning_rate": 4.355251226533396e-06, | |
"loss": 0.5967, | |
"step": 880 | |
}, | |
{ | |
"epoch": 5.653846153846154, | |
"grad_norm": 0.10528377897572419, | |
"learning_rate": 4.334235735250047e-06, | |
"loss": 0.5872, | |
"step": 882 | |
}, | |
{ | |
"epoch": 5.666666666666667, | |
"grad_norm": 0.11078816330375864, | |
"learning_rate": 4.313232210907959e-06, | |
"loss": 0.5839, | |
"step": 884 | |
}, | |
{ | |
"epoch": 5.67948717948718, | |
"grad_norm": 0.11204326662361994, | |
"learning_rate": 4.292241031040077e-06, | |
"loss": 0.5985, | |
"step": 886 | |
}, | |
{ | |
"epoch": 5.6923076923076925, | |
"grad_norm": 0.11136880270558686, | |
"learning_rate": 4.271262572957453e-06, | |
"loss": 0.5899, | |
"step": 888 | |
}, | |
{ | |
"epoch": 5.705128205128205, | |
"grad_norm": 0.11504012535772838, | |
"learning_rate": 4.250297213742473e-06, | |
"loss": 0.5861, | |
"step": 890 | |
}, | |
{ | |
"epoch": 5.717948717948718, | |
"grad_norm": 0.11477930854527991, | |
"learning_rate": 4.229345330242067e-06, | |
"loss": 0.603, | |
"step": 892 | |
}, | |
{ | |
"epoch": 5.730769230769231, | |
"grad_norm": 0.10662143707078175, | |
"learning_rate": 4.2084072990609505e-06, | |
"loss": 0.6222, | |
"step": 894 | |
}, | |
{ | |
"epoch": 5.743589743589744, | |
"grad_norm": 0.11014567262417806, | |
"learning_rate": 4.187483496554844e-06, | |
"loss": 0.607, | |
"step": 896 | |
}, | |
{ | |
"epoch": 5.756410256410256, | |
"grad_norm": 0.11111661489395548, | |
"learning_rate": 4.166574298823707e-06, | |
"loss": 0.5932, | |
"step": 898 | |
}, | |
{ | |
"epoch": 5.769230769230769, | |
"grad_norm": 0.11242007034134975, | |
"learning_rate": 4.145680081704989e-06, | |
"loss": 0.5995, | |
"step": 900 | |
}, | |
{ | |
"epoch": 5.782051282051282, | |
"grad_norm": 0.11419724226540427, | |
"learning_rate": 4.1248012207668635e-06, | |
"loss": 0.5915, | |
"step": 902 | |
}, | |
{ | |
"epoch": 5.794871794871795, | |
"grad_norm": 0.11706613959593416, | |
"learning_rate": 4.103938091301479e-06, | |
"loss": 0.618, | |
"step": 904 | |
}, | |
{ | |
"epoch": 5.8076923076923075, | |
"grad_norm": 0.11209016426196475, | |
"learning_rate": 4.08309106831822e-06, | |
"loss": 0.6047, | |
"step": 906 | |
}, | |
{ | |
"epoch": 5.82051282051282, | |
"grad_norm": 0.10918746883833244, | |
"learning_rate": 4.062260526536955e-06, | |
"loss": 0.5993, | |
"step": 908 | |
}, | |
{ | |
"epoch": 5.833333333333333, | |
"grad_norm": 0.11265204962035352, | |
"learning_rate": 4.041446840381309e-06, | |
"loss": 0.6107, | |
"step": 910 | |
}, | |
{ | |
"epoch": 5.846153846153846, | |
"grad_norm": 0.11006154535411454, | |
"learning_rate": 4.0206503839719335e-06, | |
"loss": 0.5968, | |
"step": 912 | |
}, | |
{ | |
"epoch": 5.858974358974359, | |
"grad_norm": 0.10704556131214127, | |
"learning_rate": 3.999871531119779e-06, | |
"loss": 0.6004, | |
"step": 914 | |
}, | |
{ | |
"epoch": 5.871794871794872, | |
"grad_norm": 0.11890492535370141, | |
"learning_rate": 3.9791106553193746e-06, | |
"loss": 0.6235, | |
"step": 916 | |
}, | |
{ | |
"epoch": 5.884615384615385, | |
"grad_norm": 0.11125280463928439, | |
"learning_rate": 3.9583681297421194e-06, | |
"loss": 0.5936, | |
"step": 918 | |
}, | |
{ | |
"epoch": 5.897435897435898, | |
"grad_norm": 0.11331294271299998, | |
"learning_rate": 3.937644327229572e-06, | |
"loss": 0.5935, | |
"step": 920 | |
}, | |
{ | |
"epoch": 5.910256410256411, | |
"grad_norm": 0.121603183912255, | |
"learning_rate": 3.916939620286743e-06, | |
"loss": 0.5784, | |
"step": 922 | |
}, | |
{ | |
"epoch": 5.923076923076923, | |
"grad_norm": 0.11031017606070566, | |
"learning_rate": 3.896254381075416e-06, | |
"loss": 0.572, | |
"step": 924 | |
}, | |
{ | |
"epoch": 5.935897435897436, | |
"grad_norm": 0.11514051578131597, | |
"learning_rate": 3.875588981407433e-06, | |
"loss": 0.6112, | |
"step": 926 | |
}, | |
{ | |
"epoch": 5.948717948717949, | |
"grad_norm": 0.10743392753449098, | |
"learning_rate": 3.854943792738037e-06, | |
"loss": 0.606, | |
"step": 928 | |
}, | |
{ | |
"epoch": 5.961538461538462, | |
"grad_norm": 0.11236268397619992, | |
"learning_rate": 3.834319186159179e-06, | |
"loss": 0.5922, | |
"step": 930 | |
}, | |
{ | |
"epoch": 5.9743589743589745, | |
"grad_norm": 0.12074864124016804, | |
"learning_rate": 3.8137155323928526e-06, | |
"loss": 0.5949, | |
"step": 932 | |
}, | |
{ | |
"epoch": 5.987179487179487, | |
"grad_norm": 0.10827890126242279, | |
"learning_rate": 3.7931332017844302e-06, | |
"loss": 0.6203, | |
"step": 934 | |
}, | |
{ | |
"epoch": 6.0, | |
"grad_norm": 0.10969453481755433, | |
"learning_rate": 3.7725725642960047e-06, | |
"loss": 0.5635, | |
"step": 936 | |
}, | |
{ | |
"epoch": 6.012820512820513, | |
"grad_norm": 0.1031061811953427, | |
"learning_rate": 3.752033989499742e-06, | |
"loss": 0.5165, | |
"step": 938 | |
}, | |
{ | |
"epoch": 6.0256410256410255, | |
"grad_norm": 0.1194475307958204, | |
"learning_rate": 3.7315178465712364e-06, | |
"loss": 0.5365, | |
"step": 940 | |
}, | |
{ | |
"epoch": 6.038461538461538, | |
"grad_norm": 0.12267686021895448, | |
"learning_rate": 3.7110245042828786e-06, | |
"loss": 0.557, | |
"step": 942 | |
}, | |
{ | |
"epoch": 6.051282051282051, | |
"grad_norm": 0.13498002529028316, | |
"learning_rate": 3.690554330997215e-06, | |
"loss": 0.5647, | |
"step": 944 | |
}, | |
{ | |
"epoch": 6.064102564102564, | |
"grad_norm": 0.11313547950477416, | |
"learning_rate": 3.670107694660343e-06, | |
"loss": 0.5319, | |
"step": 946 | |
}, | |
{ | |
"epoch": 6.076923076923077, | |
"grad_norm": 0.11337535375238234, | |
"learning_rate": 3.6496849627952875e-06, | |
"loss": 0.5337, | |
"step": 948 | |
}, | |
{ | |
"epoch": 6.089743589743589, | |
"grad_norm": 0.12108662571350122, | |
"learning_rate": 3.6292865024953945e-06, | |
"loss": 0.5317, | |
"step": 950 | |
}, | |
{ | |
"epoch": 6.102564102564102, | |
"grad_norm": 0.10860898068984597, | |
"learning_rate": 3.6089126804177373e-06, | |
"loss": 0.5144, | |
"step": 952 | |
}, | |
{ | |
"epoch": 6.115384615384615, | |
"grad_norm": 0.10618003581452222, | |
"learning_rate": 3.5885638627765228e-06, | |
"loss": 0.5297, | |
"step": 954 | |
}, | |
{ | |
"epoch": 6.128205128205128, | |
"grad_norm": 0.10830357191265076, | |
"learning_rate": 3.568240415336509e-06, | |
"loss": 0.5153, | |
"step": 956 | |
}, | |
{ | |
"epoch": 6.141025641025641, | |
"grad_norm": 0.11656722880396314, | |
"learning_rate": 3.547942703406433e-06, | |
"loss": 0.5689, | |
"step": 958 | |
}, | |
{ | |
"epoch": 6.153846153846154, | |
"grad_norm": 0.12409515119125744, | |
"learning_rate": 3.52767109183244e-06, | |
"loss": 0.5339, | |
"step": 960 | |
}, | |
{ | |
"epoch": 6.166666666666667, | |
"grad_norm": 0.10533677158970278, | |
"learning_rate": 3.507425944991529e-06, | |
"loss": 0.5298, | |
"step": 962 | |
}, | |
{ | |
"epoch": 6.17948717948718, | |
"grad_norm": 0.10779883016864458, | |
"learning_rate": 3.4872076267850015e-06, | |
"loss": 0.5393, | |
"step": 964 | |
}, | |
{ | |
"epoch": 6.1923076923076925, | |
"grad_norm": 0.10689267808425468, | |
"learning_rate": 3.4670165006319236e-06, | |
"loss": 0.5385, | |
"step": 966 | |
}, | |
{ | |
"epoch": 6.205128205128205, | |
"grad_norm": 0.10262511919512685, | |
"learning_rate": 3.4468529294625895e-06, | |
"loss": 0.5415, | |
"step": 968 | |
}, | |
{ | |
"epoch": 6.217948717948718, | |
"grad_norm": 0.1057687129463025, | |
"learning_rate": 3.4267172757120005e-06, | |
"loss": 0.5424, | |
"step": 970 | |
}, | |
{ | |
"epoch": 6.230769230769231, | |
"grad_norm": 0.10378572914010763, | |
"learning_rate": 3.406609901313349e-06, | |
"loss": 0.5112, | |
"step": 972 | |
}, | |
{ | |
"epoch": 6.243589743589744, | |
"grad_norm": 0.12203819824027057, | |
"learning_rate": 3.386531167691512e-06, | |
"loss": 0.5384, | |
"step": 974 | |
}, | |
{ | |
"epoch": 6.256410256410256, | |
"grad_norm": 0.10802288722224572, | |
"learning_rate": 3.36648143575656e-06, | |
"loss": 0.5028, | |
"step": 976 | |
}, | |
{ | |
"epoch": 6.269230769230769, | |
"grad_norm": 0.1142664875757734, | |
"learning_rate": 3.3464610658972584e-06, | |
"loss": 0.5292, | |
"step": 978 | |
}, | |
{ | |
"epoch": 6.282051282051282, | |
"grad_norm": 0.10568144050459893, | |
"learning_rate": 3.326470417974604e-06, | |
"loss": 0.5105, | |
"step": 980 | |
}, | |
{ | |
"epoch": 6.294871794871795, | |
"grad_norm": 0.10752642025433035, | |
"learning_rate": 3.3065098513153473e-06, | |
"loss": 0.543, | |
"step": 982 | |
}, | |
{ | |
"epoch": 6.3076923076923075, | |
"grad_norm": 0.10979310993514947, | |
"learning_rate": 3.2865797247055354e-06, | |
"loss": 0.5277, | |
"step": 984 | |
}, | |
{ | |
"epoch": 6.32051282051282, | |
"grad_norm": 0.10254738368035236, | |
"learning_rate": 3.266680396384061e-06, | |
"loss": 0.5159, | |
"step": 986 | |
}, | |
{ | |
"epoch": 6.333333333333333, | |
"grad_norm": 0.10564094033375905, | |
"learning_rate": 3.2468122240362287e-06, | |
"loss": 0.5314, | |
"step": 988 | |
}, | |
{ | |
"epoch": 6.346153846153846, | |
"grad_norm": 0.11424837464444032, | |
"learning_rate": 3.226975564787322e-06, | |
"loss": 0.5584, | |
"step": 990 | |
}, | |
{ | |
"epoch": 6.358974358974359, | |
"grad_norm": 0.10635885003440321, | |
"learning_rate": 3.2071707751961838e-06, | |
"loss": 0.542, | |
"step": 992 | |
}, | |
{ | |
"epoch": 6.371794871794872, | |
"grad_norm": 0.11342627001310603, | |
"learning_rate": 3.187398211248811e-06, | |
"loss": 0.526, | |
"step": 994 | |
}, | |
{ | |
"epoch": 6.384615384615385, | |
"grad_norm": 0.10721179464672517, | |
"learning_rate": 3.1676582283519454e-06, | |
"loss": 0.5161, | |
"step": 996 | |
}, | |
{ | |
"epoch": 6.397435897435898, | |
"grad_norm": 0.10847381371938812, | |
"learning_rate": 3.1479511813267006e-06, | |
"loss": 0.5308, | |
"step": 998 | |
}, | |
{ | |
"epoch": 6.410256410256411, | |
"grad_norm": 0.10553871633956305, | |
"learning_rate": 3.1282774244021717e-06, | |
"loss": 0.5389, | |
"step": 1000 | |
}, | |
{ | |
"epoch": 6.423076923076923, | |
"grad_norm": 0.11217705883560118, | |
"learning_rate": 3.1086373112090762e-06, | |
"loss": 0.5157, | |
"step": 1002 | |
}, | |
{ | |
"epoch": 6.435897435897436, | |
"grad_norm": 0.11158771311440012, | |
"learning_rate": 3.089031194773392e-06, | |
"loss": 0.5053, | |
"step": 1004 | |
}, | |
{ | |
"epoch": 6.448717948717949, | |
"grad_norm": 0.1089332843436129, | |
"learning_rate": 3.069459427510014e-06, | |
"loss": 0.5347, | |
"step": 1006 | |
}, | |
{ | |
"epoch": 6.461538461538462, | |
"grad_norm": 0.11195737620435117, | |
"learning_rate": 3.049922361216422e-06, | |
"loss": 0.5114, | |
"step": 1008 | |
}, | |
{ | |
"epoch": 6.4743589743589745, | |
"grad_norm": 0.10286473273779988, | |
"learning_rate": 3.0304203470663507e-06, | |
"loss": 0.5234, | |
"step": 1010 | |
}, | |
{ | |
"epoch": 6.487179487179487, | |
"grad_norm": 0.1097353874290811, | |
"learning_rate": 3.0109537356034856e-06, | |
"loss": 0.5149, | |
"step": 1012 | |
}, | |
{ | |
"epoch": 6.5, | |
"grad_norm": 0.10411123643651161, | |
"learning_rate": 2.991522876735154e-06, | |
"loss": 0.5548, | |
"step": 1014 | |
}, | |
{ | |
"epoch": 6.512820512820513, | |
"grad_norm": 0.11334395185933709, | |
"learning_rate": 2.9721281197260427e-06, | |
"loss": 0.51, | |
"step": 1016 | |
}, | |
{ | |
"epoch": 6.5256410256410255, | |
"grad_norm": 0.10500841046128222, | |
"learning_rate": 2.9527698131919156e-06, | |
"loss": 0.5139, | |
"step": 1018 | |
}, | |
{ | |
"epoch": 6.538461538461538, | |
"grad_norm": 0.11046543252263778, | |
"learning_rate": 2.9334483050933506e-06, | |
"loss": 0.5078, | |
"step": 1020 | |
}, | |
{ | |
"epoch": 6.551282051282051, | |
"grad_norm": 0.10051525374525226, | |
"learning_rate": 2.91416394272948e-06, | |
"loss": 0.5231, | |
"step": 1022 | |
}, | |
{ | |
"epoch": 6.564102564102564, | |
"grad_norm": 0.11091867450033485, | |
"learning_rate": 2.894917072731753e-06, | |
"loss": 0.5248, | |
"step": 1024 | |
}, | |
{ | |
"epoch": 6.576923076923077, | |
"grad_norm": 0.11548978463749487, | |
"learning_rate": 2.8757080410577042e-06, | |
"loss": 0.5331, | |
"step": 1026 | |
}, | |
{ | |
"epoch": 6.589743589743589, | |
"grad_norm": 0.10458731304307277, | |
"learning_rate": 2.8565371929847286e-06, | |
"loss": 0.5107, | |
"step": 1028 | |
}, | |
{ | |
"epoch": 6.602564102564102, | |
"grad_norm": 0.1050325378040027, | |
"learning_rate": 2.83740487310389e-06, | |
"loss": 0.5477, | |
"step": 1030 | |
}, | |
{ | |
"epoch": 6.615384615384615, | |
"grad_norm": 0.10848913413728074, | |
"learning_rate": 2.81831142531371e-06, | |
"loss": 0.5268, | |
"step": 1032 | |
}, | |
{ | |
"epoch": 6.628205128205128, | |
"grad_norm": 0.10721120045068884, | |
"learning_rate": 2.7992571928139984e-06, | |
"loss": 0.5433, | |
"step": 1034 | |
}, | |
{ | |
"epoch": 6.641025641025641, | |
"grad_norm": 0.11369155962674747, | |
"learning_rate": 2.780242518099675e-06, | |
"loss": 0.5359, | |
"step": 1036 | |
}, | |
{ | |
"epoch": 6.653846153846154, | |
"grad_norm": 0.10077730135750612, | |
"learning_rate": 2.761267742954629e-06, | |
"loss": 0.5283, | |
"step": 1038 | |
}, | |
{ | |
"epoch": 6.666666666666667, | |
"grad_norm": 0.10299831989759317, | |
"learning_rate": 2.7423332084455543e-06, | |
"loss": 0.5012, | |
"step": 1040 | |
}, | |
{ | |
"epoch": 6.67948717948718, | |
"grad_norm": 0.11366647060441272, | |
"learning_rate": 2.723439254915834e-06, | |
"loss": 0.5431, | |
"step": 1042 | |
}, | |
{ | |
"epoch": 6.6923076923076925, | |
"grad_norm": 0.10337571007144457, | |
"learning_rate": 2.704586221979422e-06, | |
"loss": 0.5218, | |
"step": 1044 | |
}, | |
{ | |
"epoch": 6.705128205128205, | |
"grad_norm": 0.10480130947183795, | |
"learning_rate": 2.6857744485147286e-06, | |
"loss": 0.5275, | |
"step": 1046 | |
}, | |
{ | |
"epoch": 6.717948717948718, | |
"grad_norm": 0.10841475505701116, | |
"learning_rate": 2.667004272658541e-06, | |
"loss": 0.5239, | |
"step": 1048 | |
}, | |
{ | |
"epoch": 6.730769230769231, | |
"grad_norm": 0.10494980003698792, | |
"learning_rate": 2.6482760317999338e-06, | |
"loss": 0.5307, | |
"step": 1050 | |
}, | |
{ | |
"epoch": 6.743589743589744, | |
"grad_norm": 0.11015308961784986, | |
"learning_rate": 2.629590062574221e-06, | |
"loss": 0.5188, | |
"step": 1052 | |
}, | |
{ | |
"epoch": 6.756410256410256, | |
"grad_norm": 0.10650485718812779, | |
"learning_rate": 2.610946700856885e-06, | |
"loss": 0.5289, | |
"step": 1054 | |
}, | |
{ | |
"epoch": 6.769230769230769, | |
"grad_norm": 0.11042204753698427, | |
"learning_rate": 2.592346281757552e-06, | |
"loss": 0.5256, | |
"step": 1056 | |
}, | |
{ | |
"epoch": 6.782051282051282, | |
"grad_norm": 0.1109413966927775, | |
"learning_rate": 2.5737891396139713e-06, | |
"loss": 0.5113, | |
"step": 1058 | |
}, | |
{ | |
"epoch": 6.794871794871795, | |
"grad_norm": 0.0995906902958352, | |
"learning_rate": 2.5552756079859904e-06, | |
"loss": 0.5286, | |
"step": 1060 | |
}, | |
{ | |
"epoch": 6.8076923076923075, | |
"grad_norm": 0.10719822108936067, | |
"learning_rate": 2.5368060196495785e-06, | |
"loss": 0.5346, | |
"step": 1062 | |
}, | |
{ | |
"epoch": 6.82051282051282, | |
"grad_norm": 0.10359465789573513, | |
"learning_rate": 2.5183807065908296e-06, | |
"loss": 0.5345, | |
"step": 1064 | |
}, | |
{ | |
"epoch": 6.833333333333333, | |
"grad_norm": 0.10868896685434068, | |
"learning_rate": 2.5000000000000015e-06, | |
"loss": 0.5374, | |
"step": 1066 | |
}, | |
{ | |
"epoch": 6.846153846153846, | |
"grad_norm": 0.09898844010754565, | |
"learning_rate": 2.4816642302655634e-06, | |
"loss": 0.525, | |
"step": 1068 | |
}, | |
{ | |
"epoch": 6.858974358974359, | |
"grad_norm": 0.10379473426548341, | |
"learning_rate": 2.4633737269682546e-06, | |
"loss": 0.5016, | |
"step": 1070 | |
}, | |
{ | |
"epoch": 6.871794871794872, | |
"grad_norm": 0.10543135697876233, | |
"learning_rate": 2.445128818875166e-06, | |
"loss": 0.5113, | |
"step": 1072 | |
}, | |
{ | |
"epoch": 6.884615384615385, | |
"grad_norm": 0.7212115719506329, | |
"learning_rate": 2.4269298339338205e-06, | |
"loss": 0.5296, | |
"step": 1074 | |
}, | |
{ | |
"epoch": 6.897435897435898, | |
"grad_norm": 0.10405167616760767, | |
"learning_rate": 2.408777099266291e-06, | |
"loss": 0.5305, | |
"step": 1076 | |
}, | |
{ | |
"epoch": 6.910256410256411, | |
"grad_norm": 0.1105828209294347, | |
"learning_rate": 2.3906709411633073e-06, | |
"loss": 0.5249, | |
"step": 1078 | |
}, | |
{ | |
"epoch": 6.923076923076923, | |
"grad_norm": 0.09948650821267507, | |
"learning_rate": 2.3726116850783987e-06, | |
"loss": 0.5053, | |
"step": 1080 | |
}, | |
{ | |
"epoch": 6.935897435897436, | |
"grad_norm": 0.10462584074627808, | |
"learning_rate": 2.354599655622049e-06, | |
"loss": 0.5355, | |
"step": 1082 | |
}, | |
{ | |
"epoch": 6.948717948717949, | |
"grad_norm": 0.10815781379459323, | |
"learning_rate": 2.3366351765558437e-06, | |
"loss": 0.518, | |
"step": 1084 | |
}, | |
{ | |
"epoch": 6.961538461538462, | |
"grad_norm": 0.10164985194268733, | |
"learning_rate": 2.318718570786675e-06, | |
"loss": 0.5164, | |
"step": 1086 | |
}, | |
{ | |
"epoch": 6.9743589743589745, | |
"grad_norm": 0.10415162261749741, | |
"learning_rate": 2.3008501603609147e-06, | |
"loss": 0.5027, | |
"step": 1088 | |
}, | |
{ | |
"epoch": 6.987179487179487, | |
"grad_norm": 0.10190530521809464, | |
"learning_rate": 2.283030266458644e-06, | |
"loss": 0.5015, | |
"step": 1090 | |
}, | |
{ | |
"epoch": 7.0, | |
"grad_norm": 0.10253347317045373, | |
"learning_rate": 2.265259209387867e-06, | |
"loss": 0.5162, | |
"step": 1092 | |
} | |
], | |
"logging_steps": 2, | |
"max_steps": 1560, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 10, | |
"save_steps": 500, | |
"stateful_callbacks": { | |
"TrainerControl": { | |
"args": { | |
"should_epoch_stop": false, | |
"should_evaluate": false, | |
"should_log": false, | |
"should_save": true, | |
"should_training_stop": false | |
}, | |
"attributes": {} | |
} | |
}, | |
"total_flos": 4.4209696878656225e+19, | |
"train_batch_size": 2, | |
"trial_name": null, | |
"trial_params": null | |
} | |