MiniLMv2-L6-H384_R-OCR-quality / trainer_state.json
pszemraj's picture
End of training
ee1a1f1 verified
raw
history blame
No virus
43 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9984038308060654,
"eval_steps": 250,
"global_step": 1878,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010641127959563713,
"grad_norm": 0.8616393804550171,
"learning_rate": 3.1914893617021277e-06,
"loss": 0.6847,
"num_input_tokens_seen": 327680,
"step": 10
},
{
"epoch": 0.021282255919127427,
"grad_norm": 1.3775863647460938,
"learning_rate": 6.3829787234042555e-06,
"loss": 0.6579,
"num_input_tokens_seen": 655360,
"step": 20
},
{
"epoch": 0.03192338387869114,
"grad_norm": 2.395984411239624,
"learning_rate": 9.574468085106385e-06,
"loss": 0.6053,
"num_input_tokens_seen": 983040,
"step": 30
},
{
"epoch": 0.042564511838254854,
"grad_norm": 1.8644745349884033,
"learning_rate": 1.2765957446808511e-05,
"loss": 0.53,
"num_input_tokens_seen": 1310720,
"step": 40
},
{
"epoch": 0.05320563979781857,
"grad_norm": 2.1690289974212646,
"learning_rate": 1.5957446808510637e-05,
"loss": 0.4419,
"num_input_tokens_seen": 1638400,
"step": 50
},
{
"epoch": 0.06384676775738228,
"grad_norm": 1.3926266431808472,
"learning_rate": 1.914893617021277e-05,
"loss": 0.3329,
"num_input_tokens_seen": 1966080,
"step": 60
},
{
"epoch": 0.074487895716946,
"grad_norm": 1.0763431787490845,
"learning_rate": 2.2340425531914894e-05,
"loss": 0.2703,
"num_input_tokens_seen": 2293760,
"step": 70
},
{
"epoch": 0.08512902367650971,
"grad_norm": 12.503619194030762,
"learning_rate": 2.5531914893617022e-05,
"loss": 0.1906,
"num_input_tokens_seen": 2621440,
"step": 80
},
{
"epoch": 0.09577015163607343,
"grad_norm": 0.6052917838096619,
"learning_rate": 2.872340425531915e-05,
"loss": 0.1476,
"num_input_tokens_seen": 2949120,
"step": 90
},
{
"epoch": 0.10641127959563713,
"grad_norm": 5.584522247314453,
"learning_rate": 2.9899103139013456e-05,
"loss": 0.1279,
"num_input_tokens_seen": 3276800,
"step": 100
},
{
"epoch": 0.11705240755520085,
"grad_norm": 1.0587092638015747,
"learning_rate": 2.9730941704035875e-05,
"loss": 0.112,
"num_input_tokens_seen": 3604480,
"step": 110
},
{
"epoch": 0.12769353551476456,
"grad_norm": 2.5089759826660156,
"learning_rate": 2.9562780269058297e-05,
"loss": 0.1119,
"num_input_tokens_seen": 3932160,
"step": 120
},
{
"epoch": 0.13833466347432827,
"grad_norm": 4.025810241699219,
"learning_rate": 2.939461883408072e-05,
"loss": 0.1155,
"num_input_tokens_seen": 4259840,
"step": 130
},
{
"epoch": 0.148975791433892,
"grad_norm": 0.6721552014350891,
"learning_rate": 2.922645739910314e-05,
"loss": 0.0937,
"num_input_tokens_seen": 4587520,
"step": 140
},
{
"epoch": 0.1596169193934557,
"grad_norm": 4.8363542556762695,
"learning_rate": 2.905829596412556e-05,
"loss": 0.089,
"num_input_tokens_seen": 4915200,
"step": 150
},
{
"epoch": 0.17025804735301941,
"grad_norm": 13.355521202087402,
"learning_rate": 2.889013452914798e-05,
"loss": 0.0525,
"num_input_tokens_seen": 5242880,
"step": 160
},
{
"epoch": 0.18089917531258312,
"grad_norm": 17.72276496887207,
"learning_rate": 2.8721973094170402e-05,
"loss": 0.0699,
"num_input_tokens_seen": 5570560,
"step": 170
},
{
"epoch": 0.19154030327214686,
"grad_norm": 3.537041187286377,
"learning_rate": 2.8553811659192828e-05,
"loss": 0.0811,
"num_input_tokens_seen": 5898240,
"step": 180
},
{
"epoch": 0.20218143123171056,
"grad_norm": 0.13461732864379883,
"learning_rate": 2.8385650224215247e-05,
"loss": 0.0763,
"num_input_tokens_seen": 6225920,
"step": 190
},
{
"epoch": 0.21282255919127427,
"grad_norm": 9.155119895935059,
"learning_rate": 2.821748878923767e-05,
"loss": 0.1048,
"num_input_tokens_seen": 6553600,
"step": 200
},
{
"epoch": 0.22346368715083798,
"grad_norm": 0.7209023833274841,
"learning_rate": 2.804932735426009e-05,
"loss": 0.1231,
"num_input_tokens_seen": 6881280,
"step": 210
},
{
"epoch": 0.2341048151104017,
"grad_norm": 0.5195837020874023,
"learning_rate": 2.788116591928251e-05,
"loss": 0.0537,
"num_input_tokens_seen": 7208960,
"step": 220
},
{
"epoch": 0.24474594306996542,
"grad_norm": 3.8807427883148193,
"learning_rate": 2.7713004484304933e-05,
"loss": 0.0579,
"num_input_tokens_seen": 7536640,
"step": 230
},
{
"epoch": 0.2553870710295291,
"grad_norm": 3.4100818634033203,
"learning_rate": 2.7544843049327355e-05,
"loss": 0.062,
"num_input_tokens_seen": 7864320,
"step": 240
},
{
"epoch": 0.26602819898909286,
"grad_norm": 0.3366034924983978,
"learning_rate": 2.7376681614349774e-05,
"loss": 0.0298,
"num_input_tokens_seen": 8192000,
"step": 250
},
{
"epoch": 0.26602819898909286,
"eval_accuracy": 0.99,
"eval_loss": 0.044801026582717896,
"eval_runtime": 1.1309,
"eval_samples_per_second": 442.123,
"eval_steps_per_second": 55.707,
"num_input_tokens_seen": 8192000,
"step": 250
},
{
"epoch": 0.27666932694865654,
"grad_norm": 1.2192944288253784,
"learning_rate": 2.72085201793722e-05,
"loss": 0.0562,
"num_input_tokens_seen": 8519680,
"step": 260
},
{
"epoch": 0.28731045490822027,
"grad_norm": 0.7389326691627502,
"learning_rate": 2.7040358744394622e-05,
"loss": 0.0435,
"num_input_tokens_seen": 8847360,
"step": 270
},
{
"epoch": 0.297951582867784,
"grad_norm": 1.691129446029663,
"learning_rate": 2.687219730941704e-05,
"loss": 0.0256,
"num_input_tokens_seen": 9175040,
"step": 280
},
{
"epoch": 0.3085927108273477,
"grad_norm": 0.20158784091472626,
"learning_rate": 2.6704035874439464e-05,
"loss": 0.08,
"num_input_tokens_seen": 9502720,
"step": 290
},
{
"epoch": 0.3192338387869114,
"grad_norm": 0.4045298099517822,
"learning_rate": 2.6535874439461886e-05,
"loss": 0.0174,
"num_input_tokens_seen": 9830400,
"step": 300
},
{
"epoch": 0.32987496674647515,
"grad_norm": 5.865575313568115,
"learning_rate": 2.6367713004484305e-05,
"loss": 0.0701,
"num_input_tokens_seen": 10158080,
"step": 310
},
{
"epoch": 0.34051609470603883,
"grad_norm": 12.122817993164062,
"learning_rate": 2.6199551569506727e-05,
"loss": 0.1398,
"num_input_tokens_seen": 10485760,
"step": 320
},
{
"epoch": 0.35115722266560256,
"grad_norm": 0.43689683079719543,
"learning_rate": 2.6031390134529146e-05,
"loss": 0.0645,
"num_input_tokens_seen": 10813440,
"step": 330
},
{
"epoch": 0.36179835062516624,
"grad_norm": 0.1345166265964508,
"learning_rate": 2.586322869955157e-05,
"loss": 0.0394,
"num_input_tokens_seen": 11141120,
"step": 340
},
{
"epoch": 0.37243947858473,
"grad_norm": 0.5597580075263977,
"learning_rate": 2.5695067264573994e-05,
"loss": 0.0534,
"num_input_tokens_seen": 11468800,
"step": 350
},
{
"epoch": 0.3830806065442937,
"grad_norm": 1.6686193943023682,
"learning_rate": 2.5526905829596413e-05,
"loss": 0.0499,
"num_input_tokens_seen": 11796480,
"step": 360
},
{
"epoch": 0.3937217345038574,
"grad_norm": 0.08618992567062378,
"learning_rate": 2.5358744394618835e-05,
"loss": 0.0312,
"num_input_tokens_seen": 12124160,
"step": 370
},
{
"epoch": 0.4043628624634211,
"grad_norm": 0.07978615164756775,
"learning_rate": 2.5190582959641258e-05,
"loss": 0.0488,
"num_input_tokens_seen": 12451840,
"step": 380
},
{
"epoch": 0.41500399042298486,
"grad_norm": 2.9216437339782715,
"learning_rate": 2.5022421524663677e-05,
"loss": 0.0281,
"num_input_tokens_seen": 12779520,
"step": 390
},
{
"epoch": 0.42564511838254854,
"grad_norm": 2.1254470348358154,
"learning_rate": 2.48542600896861e-05,
"loss": 0.044,
"num_input_tokens_seen": 13107200,
"step": 400
},
{
"epoch": 0.43628624634211227,
"grad_norm": 0.1027815118432045,
"learning_rate": 2.468609865470852e-05,
"loss": 0.0278,
"num_input_tokens_seen": 13434880,
"step": 410
},
{
"epoch": 0.44692737430167595,
"grad_norm": 0.15135648846626282,
"learning_rate": 2.451793721973094e-05,
"loss": 0.0448,
"num_input_tokens_seen": 13762560,
"step": 420
},
{
"epoch": 0.4575685022612397,
"grad_norm": 0.09930180758237839,
"learning_rate": 2.4349775784753363e-05,
"loss": 0.0294,
"num_input_tokens_seen": 14090240,
"step": 430
},
{
"epoch": 0.4682096302208034,
"grad_norm": 0.37529394030570984,
"learning_rate": 2.4181614349775788e-05,
"loss": 0.0437,
"num_input_tokens_seen": 14417920,
"step": 440
},
{
"epoch": 0.4788507581803671,
"grad_norm": 0.0906977429986,
"learning_rate": 2.4013452914798207e-05,
"loss": 0.0276,
"num_input_tokens_seen": 14745600,
"step": 450
},
{
"epoch": 0.48949188613993083,
"grad_norm": 2.0479931831359863,
"learning_rate": 2.384529147982063e-05,
"loss": 0.0638,
"num_input_tokens_seen": 15073280,
"step": 460
},
{
"epoch": 0.5001330140994945,
"grad_norm": 0.427298903465271,
"learning_rate": 2.367713004484305e-05,
"loss": 0.0333,
"num_input_tokens_seen": 15400960,
"step": 470
},
{
"epoch": 0.5107741420590582,
"grad_norm": 0.6889400482177734,
"learning_rate": 2.350896860986547e-05,
"loss": 0.0225,
"num_input_tokens_seen": 15728640,
"step": 480
},
{
"epoch": 0.521415270018622,
"grad_norm": 0.06804540008306503,
"learning_rate": 2.3340807174887893e-05,
"loss": 0.0285,
"num_input_tokens_seen": 16056320,
"step": 490
},
{
"epoch": 0.5320563979781857,
"grad_norm": 0.20838595926761627,
"learning_rate": 2.3172645739910312e-05,
"loss": 0.0141,
"num_input_tokens_seen": 16384000,
"step": 500
},
{
"epoch": 0.5320563979781857,
"eval_accuracy": 0.99,
"eval_loss": 0.033007875084877014,
"eval_runtime": 1.1242,
"eval_samples_per_second": 444.771,
"eval_steps_per_second": 56.041,
"num_input_tokens_seen": 16384000,
"step": 500
},
{
"epoch": 0.5426975259377494,
"grad_norm": 0.09140049666166306,
"learning_rate": 2.3004484304932734e-05,
"loss": 0.019,
"num_input_tokens_seen": 16711680,
"step": 510
},
{
"epoch": 0.5533386538973131,
"grad_norm": 0.06261716037988663,
"learning_rate": 2.283632286995516e-05,
"loss": 0.0355,
"num_input_tokens_seen": 17039360,
"step": 520
},
{
"epoch": 0.5639797818568768,
"grad_norm": 2.4450674057006836,
"learning_rate": 2.266816143497758e-05,
"loss": 0.031,
"num_input_tokens_seen": 17367040,
"step": 530
},
{
"epoch": 0.5746209098164405,
"grad_norm": 1.1212217807769775,
"learning_rate": 2.25e-05,
"loss": 0.0265,
"num_input_tokens_seen": 17694720,
"step": 540
},
{
"epoch": 0.5852620377760043,
"grad_norm": 0.638861358165741,
"learning_rate": 2.2331838565022424e-05,
"loss": 0.041,
"num_input_tokens_seen": 18022400,
"step": 550
},
{
"epoch": 0.595903165735568,
"grad_norm": 0.8384909629821777,
"learning_rate": 2.2163677130044843e-05,
"loss": 0.0377,
"num_input_tokens_seen": 18350080,
"step": 560
},
{
"epoch": 0.6065442936951316,
"grad_norm": 2.6054413318634033,
"learning_rate": 2.1995515695067265e-05,
"loss": 0.0621,
"num_input_tokens_seen": 18677760,
"step": 570
},
{
"epoch": 0.6171854216546954,
"grad_norm": 0.05188291519880295,
"learning_rate": 2.1827354260089687e-05,
"loss": 0.0089,
"num_input_tokens_seen": 19005440,
"step": 580
},
{
"epoch": 0.6278265496142591,
"grad_norm": 6.18527889251709,
"learning_rate": 2.1659192825112106e-05,
"loss": 0.0623,
"num_input_tokens_seen": 19333120,
"step": 590
},
{
"epoch": 0.6384676775738228,
"grad_norm": 4.499662399291992,
"learning_rate": 2.149103139013453e-05,
"loss": 0.0413,
"num_input_tokens_seen": 19660800,
"step": 600
},
{
"epoch": 0.6491088055333866,
"grad_norm": 0.06525593250989914,
"learning_rate": 2.1322869955156954e-05,
"loss": 0.0268,
"num_input_tokens_seen": 19988480,
"step": 610
},
{
"epoch": 0.6597499334929503,
"grad_norm": 0.7937769889831543,
"learning_rate": 2.1154708520179373e-05,
"loss": 0.0294,
"num_input_tokens_seen": 20316160,
"step": 620
},
{
"epoch": 0.6703910614525139,
"grad_norm": 0.42232292890548706,
"learning_rate": 2.0986547085201796e-05,
"loss": 0.0086,
"num_input_tokens_seen": 20643840,
"step": 630
},
{
"epoch": 0.6810321894120777,
"grad_norm": 0.23680944740772247,
"learning_rate": 2.0818385650224215e-05,
"loss": 0.0182,
"num_input_tokens_seen": 20971520,
"step": 640
},
{
"epoch": 0.6916733173716414,
"grad_norm": 0.8892483115196228,
"learning_rate": 2.0650224215246637e-05,
"loss": 0.0158,
"num_input_tokens_seen": 21299200,
"step": 650
},
{
"epoch": 0.7023144453312051,
"grad_norm": 9.271723747253418,
"learning_rate": 2.048206278026906e-05,
"loss": 0.0332,
"num_input_tokens_seen": 21626880,
"step": 660
},
{
"epoch": 0.7129555732907689,
"grad_norm": 0.681903600692749,
"learning_rate": 2.0313901345291478e-05,
"loss": 0.0402,
"num_input_tokens_seen": 21954560,
"step": 670
},
{
"epoch": 0.7235967012503325,
"grad_norm": 2.4827804565429688,
"learning_rate": 2.01457399103139e-05,
"loss": 0.0297,
"num_input_tokens_seen": 22282240,
"step": 680
},
{
"epoch": 0.7342378292098962,
"grad_norm": 2.727994203567505,
"learning_rate": 1.9977578475336323e-05,
"loss": 0.027,
"num_input_tokens_seen": 22609920,
"step": 690
},
{
"epoch": 0.74487895716946,
"grad_norm": 1.978765845298767,
"learning_rate": 1.9809417040358745e-05,
"loss": 0.0279,
"num_input_tokens_seen": 22937600,
"step": 700
},
{
"epoch": 0.7555200851290237,
"grad_norm": 2.512544870376587,
"learning_rate": 1.9641255605381167e-05,
"loss": 0.0323,
"num_input_tokens_seen": 23265280,
"step": 710
},
{
"epoch": 0.7661612130885874,
"grad_norm": 5.157982349395752,
"learning_rate": 1.947309417040359e-05,
"loss": 0.0514,
"num_input_tokens_seen": 23592960,
"step": 720
},
{
"epoch": 0.7768023410481512,
"grad_norm": 0.037381790578365326,
"learning_rate": 1.930493273542601e-05,
"loss": 0.0077,
"num_input_tokens_seen": 23920640,
"step": 730
},
{
"epoch": 0.7874434690077148,
"grad_norm": 1.0004149675369263,
"learning_rate": 1.913677130044843e-05,
"loss": 0.0315,
"num_input_tokens_seen": 24248320,
"step": 740
},
{
"epoch": 0.7980845969672785,
"grad_norm": 0.046527761965990067,
"learning_rate": 1.8968609865470853e-05,
"loss": 0.02,
"num_input_tokens_seen": 24576000,
"step": 750
},
{
"epoch": 0.7980845969672785,
"eval_accuracy": 0.99,
"eval_loss": 0.02980552613735199,
"eval_runtime": 1.1295,
"eval_samples_per_second": 442.672,
"eval_steps_per_second": 55.777,
"num_input_tokens_seen": 24576000,
"step": 750
},
{
"epoch": 0.8087257249268422,
"grad_norm": 0.3098304867744446,
"learning_rate": 1.8800448430493272e-05,
"loss": 0.02,
"num_input_tokens_seen": 24903680,
"step": 760
},
{
"epoch": 0.819366852886406,
"grad_norm": 1.8411376476287842,
"learning_rate": 1.8632286995515695e-05,
"loss": 0.0219,
"num_input_tokens_seen": 25231360,
"step": 770
},
{
"epoch": 0.8300079808459697,
"grad_norm": 0.6672658920288086,
"learning_rate": 1.8464125560538117e-05,
"loss": 0.0236,
"num_input_tokens_seen": 25559040,
"step": 780
},
{
"epoch": 0.8406491088055333,
"grad_norm": 0.15667960047721863,
"learning_rate": 1.829596412556054e-05,
"loss": 0.0373,
"num_input_tokens_seen": 25886720,
"step": 790
},
{
"epoch": 0.8512902367650971,
"grad_norm": 0.039243053644895554,
"learning_rate": 1.812780269058296e-05,
"loss": 0.0118,
"num_input_tokens_seen": 26214400,
"step": 800
},
{
"epoch": 0.8619313647246608,
"grad_norm": 0.9345981478691101,
"learning_rate": 1.795964125560538e-05,
"loss": 0.0322,
"num_input_tokens_seen": 26542080,
"step": 810
},
{
"epoch": 0.8725724926842245,
"grad_norm": 0.06790352612733841,
"learning_rate": 1.7791479820627803e-05,
"loss": 0.0097,
"num_input_tokens_seen": 26869760,
"step": 820
},
{
"epoch": 0.8832136206437883,
"grad_norm": 0.065700002014637,
"learning_rate": 1.7623318385650225e-05,
"loss": 0.0188,
"num_input_tokens_seen": 27197440,
"step": 830
},
{
"epoch": 0.8938547486033519,
"grad_norm": 3.7558648586273193,
"learning_rate": 1.7455156950672644e-05,
"loss": 0.0253,
"num_input_tokens_seen": 27525120,
"step": 840
},
{
"epoch": 0.9044958765629156,
"grad_norm": 4.746110916137695,
"learning_rate": 1.7286995515695067e-05,
"loss": 0.0171,
"num_input_tokens_seen": 27852800,
"step": 850
},
{
"epoch": 0.9151370045224794,
"grad_norm": 0.26326820254325867,
"learning_rate": 1.711883408071749e-05,
"loss": 0.0236,
"num_input_tokens_seen": 28180480,
"step": 860
},
{
"epoch": 0.9257781324820431,
"grad_norm": 0.10672000050544739,
"learning_rate": 1.695067264573991e-05,
"loss": 0.0085,
"num_input_tokens_seen": 28508160,
"step": 870
},
{
"epoch": 0.9364192604416068,
"grad_norm": 0.16295024752616882,
"learning_rate": 1.6782511210762334e-05,
"loss": 0.0137,
"num_input_tokens_seen": 28835840,
"step": 880
},
{
"epoch": 0.9470603884011706,
"grad_norm": 4.8795857429504395,
"learning_rate": 1.6614349775784756e-05,
"loss": 0.0305,
"num_input_tokens_seen": 29163520,
"step": 890
},
{
"epoch": 0.9577015163607342,
"grad_norm": 0.06518769264221191,
"learning_rate": 1.6446188340807175e-05,
"loss": 0.0117,
"num_input_tokens_seen": 29491200,
"step": 900
},
{
"epoch": 0.9683426443202979,
"grad_norm": 1.4961518049240112,
"learning_rate": 1.6278026905829597e-05,
"loss": 0.0359,
"num_input_tokens_seen": 29818880,
"step": 910
},
{
"epoch": 0.9789837722798617,
"grad_norm": 1.2783812284469604,
"learning_rate": 1.610986547085202e-05,
"loss": 0.0405,
"num_input_tokens_seen": 30146560,
"step": 920
},
{
"epoch": 0.9896249002394254,
"grad_norm": 0.15925170481204987,
"learning_rate": 1.594170403587444e-05,
"loss": 0.0356,
"num_input_tokens_seen": 30474240,
"step": 930
},
{
"epoch": 1.000266028198989,
"grad_norm": 1.536391019821167,
"learning_rate": 1.577354260089686e-05,
"loss": 0.0159,
"num_input_tokens_seen": 30799872,
"step": 940
},
{
"epoch": 1.0109071561585528,
"grad_norm": 0.04294372722506523,
"learning_rate": 1.560538116591928e-05,
"loss": 0.0437,
"num_input_tokens_seen": 31127552,
"step": 950
},
{
"epoch": 1.0215482841181165,
"grad_norm": 0.13462825119495392,
"learning_rate": 1.5437219730941705e-05,
"loss": 0.0129,
"num_input_tokens_seen": 31455232,
"step": 960
},
{
"epoch": 1.0321894120776802,
"grad_norm": 0.03951927274465561,
"learning_rate": 1.5269058295964128e-05,
"loss": 0.017,
"num_input_tokens_seen": 31782912,
"step": 970
},
{
"epoch": 1.042830540037244,
"grad_norm": 0.12142454832792282,
"learning_rate": 1.5100896860986547e-05,
"loss": 0.0207,
"num_input_tokens_seen": 32110592,
"step": 980
},
{
"epoch": 1.0534716679968077,
"grad_norm": 0.11652370542287827,
"learning_rate": 1.4932735426008969e-05,
"loss": 0.0176,
"num_input_tokens_seen": 32438272,
"step": 990
},
{
"epoch": 1.0641127959563714,
"grad_norm": 4.033369064331055,
"learning_rate": 1.476457399103139e-05,
"loss": 0.0085,
"num_input_tokens_seen": 32765952,
"step": 1000
},
{
"epoch": 1.0641127959563714,
"eval_accuracy": 0.994,
"eval_loss": 0.022239448502659798,
"eval_runtime": 1.1241,
"eval_samples_per_second": 444.814,
"eval_steps_per_second": 56.047,
"num_input_tokens_seen": 32765952,
"step": 1000
},
{
"epoch": 1.0747539239159352,
"grad_norm": 0.10022466629743576,
"learning_rate": 1.4596412556053812e-05,
"loss": 0.0196,
"num_input_tokens_seen": 33093632,
"step": 1010
},
{
"epoch": 1.085395051875499,
"grad_norm": 0.0608280785381794,
"learning_rate": 1.4428251121076234e-05,
"loss": 0.0244,
"num_input_tokens_seen": 33421312,
"step": 1020
},
{
"epoch": 1.0960361798350626,
"grad_norm": 0.6638007164001465,
"learning_rate": 1.4260089686098655e-05,
"loss": 0.0049,
"num_input_tokens_seen": 33748992,
"step": 1030
},
{
"epoch": 1.1066773077946261,
"grad_norm": 0.17382824420928955,
"learning_rate": 1.4091928251121077e-05,
"loss": 0.0106,
"num_input_tokens_seen": 34076672,
"step": 1040
},
{
"epoch": 1.1173184357541899,
"grad_norm": 0.10657654702663422,
"learning_rate": 1.3923766816143498e-05,
"loss": 0.0381,
"num_input_tokens_seen": 34404352,
"step": 1050
},
{
"epoch": 1.1279595637137536,
"grad_norm": 0.7529979348182678,
"learning_rate": 1.375560538116592e-05,
"loss": 0.0235,
"num_input_tokens_seen": 34732032,
"step": 1060
},
{
"epoch": 1.1386006916733173,
"grad_norm": 0.07195574790239334,
"learning_rate": 1.358744394618834e-05,
"loss": 0.0173,
"num_input_tokens_seen": 35059712,
"step": 1070
},
{
"epoch": 1.149241819632881,
"grad_norm": 0.8922456502914429,
"learning_rate": 1.3419282511210763e-05,
"loss": 0.0201,
"num_input_tokens_seen": 35387392,
"step": 1080
},
{
"epoch": 1.1598829475924448,
"grad_norm": 0.2780587375164032,
"learning_rate": 1.3251121076233184e-05,
"loss": 0.0071,
"num_input_tokens_seen": 35715072,
"step": 1090
},
{
"epoch": 1.1705240755520085,
"grad_norm": 0.014401647262275219,
"learning_rate": 1.3082959641255604e-05,
"loss": 0.0025,
"num_input_tokens_seen": 36042752,
"step": 1100
},
{
"epoch": 1.1811652035115723,
"grad_norm": 0.07402833551168442,
"learning_rate": 1.2914798206278028e-05,
"loss": 0.0038,
"num_input_tokens_seen": 36370432,
"step": 1110
},
{
"epoch": 1.191806331471136,
"grad_norm": 0.035160522907972336,
"learning_rate": 1.2746636771300449e-05,
"loss": 0.0221,
"num_input_tokens_seen": 36698112,
"step": 1120
},
{
"epoch": 1.2024474594306997,
"grad_norm": 0.23754417896270752,
"learning_rate": 1.257847533632287e-05,
"loss": 0.0044,
"num_input_tokens_seen": 37025792,
"step": 1130
},
{
"epoch": 1.2130885873902635,
"grad_norm": 0.07629762589931488,
"learning_rate": 1.241031390134529e-05,
"loss": 0.0119,
"num_input_tokens_seen": 37353472,
"step": 1140
},
{
"epoch": 1.223729715349827,
"grad_norm": 0.23725423216819763,
"learning_rate": 1.2242152466367714e-05,
"loss": 0.0279,
"num_input_tokens_seen": 37681152,
"step": 1150
},
{
"epoch": 1.2343708433093907,
"grad_norm": 1.0171340703964233,
"learning_rate": 1.2073991031390135e-05,
"loss": 0.0531,
"num_input_tokens_seen": 38008832,
"step": 1160
},
{
"epoch": 1.2450119712689545,
"grad_norm": 0.016075875610113144,
"learning_rate": 1.1905829596412556e-05,
"loss": 0.0261,
"num_input_tokens_seen": 38336512,
"step": 1170
},
{
"epoch": 1.2556530992285182,
"grad_norm": 0.8257108330726624,
"learning_rate": 1.1737668161434978e-05,
"loss": 0.0166,
"num_input_tokens_seen": 38664192,
"step": 1180
},
{
"epoch": 1.266294227188082,
"grad_norm": 0.0884622186422348,
"learning_rate": 1.15695067264574e-05,
"loss": 0.0077,
"num_input_tokens_seen": 38991872,
"step": 1190
},
{
"epoch": 1.2769353551476457,
"grad_norm": 0.101267971098423,
"learning_rate": 1.1401345291479821e-05,
"loss": 0.019,
"num_input_tokens_seen": 39319552,
"step": 1200
},
{
"epoch": 1.2875764831072094,
"grad_norm": 2.194119691848755,
"learning_rate": 1.1233183856502243e-05,
"loss": 0.0131,
"num_input_tokens_seen": 39647232,
"step": 1210
},
{
"epoch": 1.2982176110667731,
"grad_norm": 2.7684483528137207,
"learning_rate": 1.1065022421524664e-05,
"loss": 0.0076,
"num_input_tokens_seen": 39974912,
"step": 1220
},
{
"epoch": 1.3088587390263369,
"grad_norm": 2.1547205448150635,
"learning_rate": 1.0896860986547085e-05,
"loss": 0.0242,
"num_input_tokens_seen": 40302592,
"step": 1230
},
{
"epoch": 1.3194998669859004,
"grad_norm": 0.39225855469703674,
"learning_rate": 1.0728699551569507e-05,
"loss": 0.013,
"num_input_tokens_seen": 40630272,
"step": 1240
},
{
"epoch": 1.3301409949454643,
"grad_norm": 0.12444789707660675,
"learning_rate": 1.056053811659193e-05,
"loss": 0.0174,
"num_input_tokens_seen": 40957952,
"step": 1250
},
{
"epoch": 1.3301409949454643,
"eval_accuracy": 0.994,
"eval_loss": 0.020717209205031395,
"eval_runtime": 1.1258,
"eval_samples_per_second": 444.121,
"eval_steps_per_second": 55.959,
"num_input_tokens_seen": 40957952,
"step": 1250
},
{
"epoch": 1.3407821229050279,
"grad_norm": 0.224708691239357,
"learning_rate": 1.039237668161435e-05,
"loss": 0.0087,
"num_input_tokens_seen": 41285632,
"step": 1260
},
{
"epoch": 1.3514232508645916,
"grad_norm": 0.08499462902545929,
"learning_rate": 1.022421524663677e-05,
"loss": 0.0182,
"num_input_tokens_seen": 41613312,
"step": 1270
},
{
"epoch": 1.3620643788241553,
"grad_norm": 0.05140333250164986,
"learning_rate": 1.0056053811659195e-05,
"loss": 0.0034,
"num_input_tokens_seen": 41940992,
"step": 1280
},
{
"epoch": 1.372705506783719,
"grad_norm": 0.05546234920620918,
"learning_rate": 9.887892376681615e-06,
"loss": 0.0117,
"num_input_tokens_seen": 42268672,
"step": 1290
},
{
"epoch": 1.3833466347432828,
"grad_norm": 0.029206566512584686,
"learning_rate": 9.719730941704036e-06,
"loss": 0.0179,
"num_input_tokens_seen": 42596352,
"step": 1300
},
{
"epoch": 1.3939877627028465,
"grad_norm": 0.3235812485218048,
"learning_rate": 9.551569506726456e-06,
"loss": 0.0333,
"num_input_tokens_seen": 42924032,
"step": 1310
},
{
"epoch": 1.4046288906624103,
"grad_norm": 4.916908264160156,
"learning_rate": 9.38340807174888e-06,
"loss": 0.0167,
"num_input_tokens_seen": 43251712,
"step": 1320
},
{
"epoch": 1.415270018621974,
"grad_norm": 0.10124430060386658,
"learning_rate": 9.215246636771301e-06,
"loss": 0.0299,
"num_input_tokens_seen": 43579392,
"step": 1330
},
{
"epoch": 1.4259111465815377,
"grad_norm": 0.09930448234081268,
"learning_rate": 9.047085201793722e-06,
"loss": 0.0112,
"num_input_tokens_seen": 43907072,
"step": 1340
},
{
"epoch": 1.4365522745411012,
"grad_norm": 0.1370278298854828,
"learning_rate": 8.878923766816144e-06,
"loss": 0.0105,
"num_input_tokens_seen": 44234752,
"step": 1350
},
{
"epoch": 1.4471934025006652,
"grad_norm": 1.9884629249572754,
"learning_rate": 8.710762331838565e-06,
"loss": 0.0093,
"num_input_tokens_seen": 44562432,
"step": 1360
},
{
"epoch": 1.4578345304602287,
"grad_norm": 0.768826961517334,
"learning_rate": 8.542600896860987e-06,
"loss": 0.0297,
"num_input_tokens_seen": 44890112,
"step": 1370
},
{
"epoch": 1.4684756584197924,
"grad_norm": 0.08758696168661118,
"learning_rate": 8.374439461883408e-06,
"loss": 0.0234,
"num_input_tokens_seen": 45217792,
"step": 1380
},
{
"epoch": 1.4791167863793562,
"grad_norm": 0.1405934989452362,
"learning_rate": 8.20627802690583e-06,
"loss": 0.0072,
"num_input_tokens_seen": 45545472,
"step": 1390
},
{
"epoch": 1.48975791433892,
"grad_norm": 0.32703763246536255,
"learning_rate": 8.03811659192825e-06,
"loss": 0.0023,
"num_input_tokens_seen": 45873152,
"step": 1400
},
{
"epoch": 1.5003990422984836,
"grad_norm": 0.8952039480209351,
"learning_rate": 7.869955156950673e-06,
"loss": 0.0183,
"num_input_tokens_seen": 46200832,
"step": 1410
},
{
"epoch": 1.5110401702580474,
"grad_norm": 0.2962280213832855,
"learning_rate": 7.701793721973095e-06,
"loss": 0.0013,
"num_input_tokens_seen": 46528512,
"step": 1420
},
{
"epoch": 1.5216812982176111,
"grad_norm": 2.0377979278564453,
"learning_rate": 7.533632286995516e-06,
"loss": 0.0195,
"num_input_tokens_seen": 46856192,
"step": 1430
},
{
"epoch": 1.5323224261771746,
"grad_norm": 0.08011902123689651,
"learning_rate": 7.365470852017937e-06,
"loss": 0.0065,
"num_input_tokens_seen": 47183872,
"step": 1440
},
{
"epoch": 1.5429635541367386,
"grad_norm": 0.07826100289821625,
"learning_rate": 7.197309417040359e-06,
"loss": 0.0203,
"num_input_tokens_seen": 47511552,
"step": 1450
},
{
"epoch": 1.553604682096302,
"grad_norm": 0.08626201748847961,
"learning_rate": 7.02914798206278e-06,
"loss": 0.0123,
"num_input_tokens_seen": 47839232,
"step": 1460
},
{
"epoch": 1.564245810055866,
"grad_norm": 1.227737545967102,
"learning_rate": 6.860986547085202e-06,
"loss": 0.0159,
"num_input_tokens_seen": 48166912,
"step": 1470
},
{
"epoch": 1.5748869380154296,
"grad_norm": 0.45808491110801697,
"learning_rate": 6.692825112107623e-06,
"loss": 0.0182,
"num_input_tokens_seen": 48494592,
"step": 1480
},
{
"epoch": 1.5855280659749933,
"grad_norm": 0.19725441932678223,
"learning_rate": 6.524663677130045e-06,
"loss": 0.011,
"num_input_tokens_seen": 48822272,
"step": 1490
},
{
"epoch": 1.596169193934557,
"grad_norm": 0.11997473984956741,
"learning_rate": 6.356502242152466e-06,
"loss": 0.0104,
"num_input_tokens_seen": 49149952,
"step": 1500
},
{
"epoch": 1.596169193934557,
"eval_accuracy": 0.996,
"eval_loss": 0.02015475556254387,
"eval_runtime": 1.1247,
"eval_samples_per_second": 444.581,
"eval_steps_per_second": 56.017,
"num_input_tokens_seen": 49149952,
"step": 1500
},
{
"epoch": 1.6068103218941208,
"grad_norm": 0.08161328732967377,
"learning_rate": 6.188340807174889e-06,
"loss": 0.011,
"num_input_tokens_seen": 49477632,
"step": 1510
},
{
"epoch": 1.6174514498536845,
"grad_norm": 0.04879956319928169,
"learning_rate": 6.020179372197309e-06,
"loss": 0.0034,
"num_input_tokens_seen": 49805312,
"step": 1520
},
{
"epoch": 1.6280925778132482,
"grad_norm": 0.2356010526418686,
"learning_rate": 5.8520179372197316e-06,
"loss": 0.0305,
"num_input_tokens_seen": 50132992,
"step": 1530
},
{
"epoch": 1.638733705772812,
"grad_norm": 0.08499031513929367,
"learning_rate": 5.683856502242152e-06,
"loss": 0.0106,
"num_input_tokens_seen": 50460672,
"step": 1540
},
{
"epoch": 1.6493748337323755,
"grad_norm": 0.10495586693286896,
"learning_rate": 5.5156950672645745e-06,
"loss": 0.012,
"num_input_tokens_seen": 50788352,
"step": 1550
},
{
"epoch": 1.6600159616919394,
"grad_norm": 0.09235712140798569,
"learning_rate": 5.347533632286995e-06,
"loss": 0.0017,
"num_input_tokens_seen": 51116032,
"step": 1560
},
{
"epoch": 1.670657089651503,
"grad_norm": 0.04202970489859581,
"learning_rate": 5.1793721973094175e-06,
"loss": 0.0172,
"num_input_tokens_seen": 51443712,
"step": 1570
},
{
"epoch": 1.681298217611067,
"grad_norm": 3.6560862064361572,
"learning_rate": 5.011210762331839e-06,
"loss": 0.0259,
"num_input_tokens_seen": 51771392,
"step": 1580
},
{
"epoch": 1.6919393455706304,
"grad_norm": 0.20075471699237823,
"learning_rate": 4.8430493273542605e-06,
"loss": 0.0144,
"num_input_tokens_seen": 52099072,
"step": 1590
},
{
"epoch": 1.7025804735301941,
"grad_norm": 0.14858105778694153,
"learning_rate": 4.674887892376682e-06,
"loss": 0.0099,
"num_input_tokens_seen": 52426752,
"step": 1600
},
{
"epoch": 1.7132216014897579,
"grad_norm": 0.08154450356960297,
"learning_rate": 4.506726457399103e-06,
"loss": 0.0155,
"num_input_tokens_seen": 52754432,
"step": 1610
},
{
"epoch": 1.7238627294493216,
"grad_norm": 0.030162209644913673,
"learning_rate": 4.338565022421525e-06,
"loss": 0.0087,
"num_input_tokens_seen": 53082112,
"step": 1620
},
{
"epoch": 1.7345038574088854,
"grad_norm": 0.058421239256858826,
"learning_rate": 4.170403587443946e-06,
"loss": 0.0205,
"num_input_tokens_seen": 53409792,
"step": 1630
},
{
"epoch": 1.745144985368449,
"grad_norm": 0.9610540270805359,
"learning_rate": 4.002242152466368e-06,
"loss": 0.0084,
"num_input_tokens_seen": 53737472,
"step": 1640
},
{
"epoch": 1.7557861133280128,
"grad_norm": 0.3001765310764313,
"learning_rate": 3.834080717488789e-06,
"loss": 0.0154,
"num_input_tokens_seen": 54065152,
"step": 1650
},
{
"epoch": 1.7664272412875763,
"grad_norm": 0.07005713880062103,
"learning_rate": 3.665919282511211e-06,
"loss": 0.0166,
"num_input_tokens_seen": 54392832,
"step": 1660
},
{
"epoch": 1.7770683692471403,
"grad_norm": 0.044125888496637344,
"learning_rate": 3.4977578475336323e-06,
"loss": 0.0016,
"num_input_tokens_seen": 54720512,
"step": 1670
},
{
"epoch": 1.7877094972067038,
"grad_norm": 1.5570340156555176,
"learning_rate": 3.329596412556054e-06,
"loss": 0.0208,
"num_input_tokens_seen": 55048192,
"step": 1680
},
{
"epoch": 1.7983506251662678,
"grad_norm": 0.12797504663467407,
"learning_rate": 3.1614349775784753e-06,
"loss": 0.0127,
"num_input_tokens_seen": 55375872,
"step": 1690
},
{
"epoch": 1.8089917531258313,
"grad_norm": 0.12429122626781464,
"learning_rate": 2.9932735426008968e-06,
"loss": 0.0015,
"num_input_tokens_seen": 55703552,
"step": 1700
},
{
"epoch": 1.819632881085395,
"grad_norm": 0.15149074792861938,
"learning_rate": 2.8251121076233182e-06,
"loss": 0.0083,
"num_input_tokens_seen": 56031232,
"step": 1710
},
{
"epoch": 1.8302740090449587,
"grad_norm": 0.10725903511047363,
"learning_rate": 2.65695067264574e-06,
"loss": 0.0071,
"num_input_tokens_seen": 56358912,
"step": 1720
},
{
"epoch": 1.8409151370045225,
"grad_norm": 0.1267658919095993,
"learning_rate": 2.4887892376681616e-06,
"loss": 0.0087,
"num_input_tokens_seen": 56686592,
"step": 1730
},
{
"epoch": 1.8515562649640862,
"grad_norm": 0.35703355073928833,
"learning_rate": 2.320627802690583e-06,
"loss": 0.0068,
"num_input_tokens_seen": 57014272,
"step": 1740
},
{
"epoch": 1.86219739292365,
"grad_norm": 0.7102775573730469,
"learning_rate": 2.1524663677130046e-06,
"loss": 0.0237,
"num_input_tokens_seen": 57341952,
"step": 1750
},
{
"epoch": 1.86219739292365,
"eval_accuracy": 0.996,
"eval_loss": 0.018471572548151016,
"eval_runtime": 1.1252,
"eval_samples_per_second": 444.377,
"eval_steps_per_second": 55.992,
"num_input_tokens_seen": 57341952,
"step": 1750
},
{
"epoch": 1.8728385208832137,
"grad_norm": 0.04301352798938751,
"learning_rate": 1.984304932735426e-06,
"loss": 0.0102,
"num_input_tokens_seen": 57669632,
"step": 1760
},
{
"epoch": 1.8834796488427772,
"grad_norm": 0.12998220324516296,
"learning_rate": 1.8161434977578476e-06,
"loss": 0.034,
"num_input_tokens_seen": 57997312,
"step": 1770
},
{
"epoch": 1.8941207768023411,
"grad_norm": 0.05428827181458473,
"learning_rate": 1.647982062780269e-06,
"loss": 0.0034,
"num_input_tokens_seen": 58324992,
"step": 1780
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.031001785770058632,
"learning_rate": 1.4798206278026905e-06,
"loss": 0.0201,
"num_input_tokens_seen": 58652672,
"step": 1790
},
{
"epoch": 1.9154030327214686,
"grad_norm": 0.06974712759256363,
"learning_rate": 1.3116591928251122e-06,
"loss": 0.0073,
"num_input_tokens_seen": 58980352,
"step": 1800
},
{
"epoch": 1.9260441606810321,
"grad_norm": 0.028872903436422348,
"learning_rate": 1.1434977578475337e-06,
"loss": 0.0201,
"num_input_tokens_seen": 59308032,
"step": 1810
},
{
"epoch": 1.9366852886405959,
"grad_norm": 0.04791630432009697,
"learning_rate": 9.75336322869955e-07,
"loss": 0.0365,
"num_input_tokens_seen": 59635712,
"step": 1820
},
{
"epoch": 1.9473264166001596,
"grad_norm": 0.9329636096954346,
"learning_rate": 8.071748878923768e-07,
"loss": 0.0041,
"num_input_tokens_seen": 59963392,
"step": 1830
},
{
"epoch": 1.9579675445597233,
"grad_norm": 0.2609878182411194,
"learning_rate": 6.390134529147982e-07,
"loss": 0.0191,
"num_input_tokens_seen": 60291072,
"step": 1840
},
{
"epoch": 1.968608672519287,
"grad_norm": 1.2760034799575806,
"learning_rate": 4.7085201793721974e-07,
"loss": 0.006,
"num_input_tokens_seen": 60618752,
"step": 1850
},
{
"epoch": 1.9792498004788508,
"grad_norm": 0.5698215961456299,
"learning_rate": 3.026905829596413e-07,
"loss": 0.0106,
"num_input_tokens_seen": 60946432,
"step": 1860
},
{
"epoch": 1.9898909284384145,
"grad_norm": 0.08324664831161499,
"learning_rate": 1.345291479820628e-07,
"loss": 0.0096,
"num_input_tokens_seen": 61274112,
"step": 1870
},
{
"epoch": 1.9984038308060654,
"num_input_tokens_seen": 61536256,
"step": 1878,
"total_flos": 3986132331896832.0,
"train_loss": 0.0478181641492338,
"train_runtime": 541.4691,
"train_samples_per_second": 222.136,
"train_steps_per_second": 3.468
}
],
"logging_steps": 10,
"max_steps": 1878,
"num_input_tokens_seen": 61536256,
"num_train_epochs": 2,
"save_steps": 400,
"total_flos": 3986132331896832.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}