{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.944596361907027,
"eval_steps": 500,
"global_step": 50000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.059445963619070265,
"grad_norm": 2.0723960399627686,
"learning_rate": 4.985138509095233e-05,
"loss": 3.6788,
"step": 500
},
{
"epoch": 0.11889192723814053,
"grad_norm": 1.9278995990753174,
"learning_rate": 4.970277018190465e-05,
"loss": 3.4742,
"step": 1000
},
{
"epoch": 0.1783378908572108,
"grad_norm": 1.4848977327346802,
"learning_rate": 4.955415527285698e-05,
"loss": 3.3942,
"step": 1500
},
{
"epoch": 0.23778385447628106,
"grad_norm": 1.3492341041564941,
"learning_rate": 4.94055403638093e-05,
"loss": 3.3358,
"step": 2000
},
{
"epoch": 0.2972298180953513,
"grad_norm": 1.212128758430481,
"learning_rate": 4.925692545476163e-05,
"loss": 3.2851,
"step": 2500
},
{
"epoch": 0.3566757817144216,
"grad_norm": 1.1597293615341187,
"learning_rate": 4.9108310545713945e-05,
"loss": 3.2331,
"step": 3000
},
{
"epoch": 0.41612174533349183,
"grad_norm": 0.9653922319412231,
"learning_rate": 4.8959695636666275e-05,
"loss": 3.2339,
"step": 3500
},
{
"epoch": 0.4755677089525621,
"grad_norm": 1.0085793733596802,
"learning_rate": 4.88110807276186e-05,
"loss": 3.1856,
"step": 4000
},
{
"epoch": 0.5350136725716323,
"grad_norm": 1.0556505918502808,
"learning_rate": 4.866246581857092e-05,
"loss": 3.1748,
"step": 4500
},
{
"epoch": 0.5944596361907026,
"grad_norm": 0.9526228904724121,
"learning_rate": 4.851385090952324e-05,
"loss": 3.1529,
"step": 5000
},
{
"epoch": 0.6539055998097729,
"grad_norm": 0.984980046749115,
"learning_rate": 4.836523600047557e-05,
"loss": 3.1378,
"step": 5500
},
{
"epoch": 0.7133515634288432,
"grad_norm": 1.0135027170181274,
"learning_rate": 4.8216621091427895e-05,
"loss": 3.0848,
"step": 6000
},
{
"epoch": 0.7727975270479135,
"grad_norm": 0.9454924464225769,
"learning_rate": 4.806800618238022e-05,
"loss": 3.0916,
"step": 6500
},
{
"epoch": 0.8322434906669837,
"grad_norm": 0.9793129563331604,
"learning_rate": 4.791939127333254e-05,
"loss": 3.0642,
"step": 7000
},
{
"epoch": 0.891689454286054,
"grad_norm": 0.9016062617301941,
"learning_rate": 4.777077636428487e-05,
"loss": 3.0657,
"step": 7500
},
{
"epoch": 0.9511354179051242,
"grad_norm": 0.8690605163574219,
"learning_rate": 4.762216145523719e-05,
"loss": 3.0281,
"step": 8000
},
{
"epoch": 1.0105813815241944,
"grad_norm": 0.891808271408081,
"learning_rate": 4.7473546546189516e-05,
"loss": 3.0155,
"step": 8500
},
{
"epoch": 1.0700273451432647,
"grad_norm": 0.9521974325180054,
"learning_rate": 4.732493163714184e-05,
"loss": 2.9713,
"step": 9000
},
{
"epoch": 1.129473308762335,
"grad_norm": 0.9132643938064575,
"learning_rate": 4.717631672809417e-05,
"loss": 2.9663,
"step": 9500
},
{
"epoch": 1.1889192723814053,
"grad_norm": 0.909182608127594,
"learning_rate": 4.702770181904649e-05,
"loss": 2.9616,
"step": 10000
},
{
"epoch": 1.2483652360004756,
"grad_norm": 0.912726104259491,
"learning_rate": 4.687908690999881e-05,
"loss": 2.9653,
"step": 10500
},
{
"epoch": 1.3078111996195458,
"grad_norm": 0.8568936586380005,
"learning_rate": 4.6730472000951136e-05,
"loss": 2.9486,
"step": 11000
},
{
"epoch": 1.3672571632386161,
"grad_norm": 0.9120291471481323,
"learning_rate": 4.6581857091903465e-05,
"loss": 2.932,
"step": 11500
},
{
"epoch": 1.4267031268576864,
"grad_norm": 0.981961190700531,
"learning_rate": 4.643324218285579e-05,
"loss": 2.9345,
"step": 12000
},
{
"epoch": 1.4861490904767567,
"grad_norm": 0.9763424396514893,
"learning_rate": 4.628462727380811e-05,
"loss": 2.9193,
"step": 12500
},
{
"epoch": 1.545595054095827,
"grad_norm": 0.8868328332901001,
"learning_rate": 4.6136012364760434e-05,
"loss": 2.9164,
"step": 13000
},
{
"epoch": 1.605041017714897,
"grad_norm": 0.9175488352775574,
"learning_rate": 4.598739745571276e-05,
"loss": 2.8932,
"step": 13500
},
{
"epoch": 1.6644869813339676,
"grad_norm": 0.890186607837677,
"learning_rate": 4.583878254666508e-05,
"loss": 2.8933,
"step": 14000
},
{
"epoch": 1.7239329449530376,
"grad_norm": 0.9198343753814697,
"learning_rate": 4.569016763761741e-05,
"loss": 2.881,
"step": 14500
},
{
"epoch": 1.783378908572108,
"grad_norm": 0.9706104397773743,
"learning_rate": 4.554155272856973e-05,
"loss": 2.8705,
"step": 15000
},
{
"epoch": 1.8428248721911782,
"grad_norm": 0.9355807304382324,
"learning_rate": 4.539293781952206e-05,
"loss": 2.8601,
"step": 15500
},
{
"epoch": 1.9022708358102485,
"grad_norm": 0.8972137570381165,
"learning_rate": 4.524432291047438e-05,
"loss": 2.8632,
"step": 16000
},
{
"epoch": 1.9617167994293188,
"grad_norm": 0.8553013801574707,
"learning_rate": 4.5095708001426706e-05,
"loss": 2.8696,
"step": 16500
},
{
"epoch": 2.021162763048389,
"grad_norm": 0.8952363133430481,
"learning_rate": 4.494709309237903e-05,
"loss": 2.8541,
"step": 17000
},
{
"epoch": 2.0806087266674593,
"grad_norm": 0.8947279453277588,
"learning_rate": 4.479847818333135e-05,
"loss": 2.8203,
"step": 17500
},
{
"epoch": 2.1400546902865294,
"grad_norm": 0.8680304884910583,
"learning_rate": 4.4649863274283674e-05,
"loss": 2.8088,
"step": 18000
},
{
"epoch": 2.1995006539056,
"grad_norm": 0.8425644040107727,
"learning_rate": 4.4501248365236004e-05,
"loss": 2.8064,
"step": 18500
},
{
"epoch": 2.25894661752467,
"grad_norm": 0.9474213719367981,
"learning_rate": 4.4352633456188327e-05,
"loss": 2.7851,
"step": 19000
},
{
"epoch": 2.3183925811437405,
"grad_norm": 0.9292487502098083,
"learning_rate": 4.420401854714065e-05,
"loss": 2.8062,
"step": 19500
},
{
"epoch": 2.3778385447628105,
"grad_norm": 0.8527488708496094,
"learning_rate": 4.405540363809297e-05,
"loss": 2.7851,
"step": 20000
},
{
"epoch": 2.437284508381881,
"grad_norm": 0.9439261555671692,
"learning_rate": 4.39067887290453e-05,
"loss": 2.7873,
"step": 20500
},
{
"epoch": 2.496730472000951,
"grad_norm": 0.9343836903572083,
"learning_rate": 4.3758173819997624e-05,
"loss": 2.7611,
"step": 21000
},
{
"epoch": 2.5561764356200216,
"grad_norm": 0.9050599932670593,
"learning_rate": 4.360955891094995e-05,
"loss": 2.767,
"step": 21500
},
{
"epoch": 2.6156223992390917,
"grad_norm": 0.9053699374198914,
"learning_rate": 4.346094400190227e-05,
"loss": 2.7873,
"step": 22000
},
{
"epoch": 2.6750683628581617,
"grad_norm": 0.9282116293907166,
"learning_rate": 4.33123290928546e-05,
"loss": 2.7607,
"step": 22500
},
{
"epoch": 2.7345143264772322,
"grad_norm": 0.9617480635643005,
"learning_rate": 4.316371418380692e-05,
"loss": 2.7678,
"step": 23000
},
{
"epoch": 2.7939602900963023,
"grad_norm": 0.9725137948989868,
"learning_rate": 4.3015099274759244e-05,
"loss": 2.7665,
"step": 23500
},
{
"epoch": 2.853406253715373,
"grad_norm": 0.9514666199684143,
"learning_rate": 4.286648436571157e-05,
"loss": 2.7534,
"step": 24000
},
{
"epoch": 2.912852217334443,
"grad_norm": 0.9485461115837097,
"learning_rate": 4.27178694566639e-05,
"loss": 2.7306,
"step": 24500
},
{
"epoch": 2.9722981809535134,
"grad_norm": 1.014106035232544,
"learning_rate": 4.256925454761622e-05,
"loss": 2.736,
"step": 25000
},
{
"epoch": 3.0317441445725835,
"grad_norm": 0.9117903113365173,
"learning_rate": 4.242063963856854e-05,
"loss": 2.7278,
"step": 25500
},
{
"epoch": 3.091190108191654,
"grad_norm": 0.8904880881309509,
"learning_rate": 4.2272024729520865e-05,
"loss": 2.7156,
"step": 26000
},
{
"epoch": 3.150636071810724,
"grad_norm": 0.8653568625450134,
"learning_rate": 4.2123409820473194e-05,
"loss": 2.7137,
"step": 26500
},
{
"epoch": 3.210082035429794,
"grad_norm": 0.9386480450630188,
"learning_rate": 4.197479491142551e-05,
"loss": 2.7021,
"step": 27000
},
{
"epoch": 3.2695279990488646,
"grad_norm": 1.0122427940368652,
"learning_rate": 4.182618000237784e-05,
"loss": 2.699,
"step": 27500
},
{
"epoch": 3.3289739626679347,
"grad_norm": 0.9319558143615723,
"learning_rate": 4.167756509333017e-05,
"loss": 2.689,
"step": 28000
},
{
"epoch": 3.388419926287005,
"grad_norm": 0.9281746745109558,
"learning_rate": 4.152895018428249e-05,
"loss": 2.7027,
"step": 28500
},
{
"epoch": 3.4478658899060752,
"grad_norm": 0.9750462770462036,
"learning_rate": 4.1380335275234815e-05,
"loss": 2.6947,
"step": 29000
},
{
"epoch": 3.5073118535251457,
"grad_norm": 0.8887720704078674,
"learning_rate": 4.123172036618714e-05,
"loss": 2.6864,
"step": 29500
},
{
"epoch": 3.566757817144216,
"grad_norm": 0.9884176254272461,
"learning_rate": 4.108310545713947e-05,
"loss": 2.6893,
"step": 30000
},
{
"epoch": 3.6262037807632863,
"grad_norm": 0.9995080828666687,
"learning_rate": 4.093449054809178e-05,
"loss": 2.6734,
"step": 30500
},
{
"epoch": 3.6856497443823564,
"grad_norm": 1.0068608522415161,
"learning_rate": 4.078587563904411e-05,
"loss": 2.6766,
"step": 31000
},
{
"epoch": 3.7450957080014264,
"grad_norm": 1.0225422382354736,
"learning_rate": 4.0637260729996435e-05,
"loss": 2.6757,
"step": 31500
},
{
"epoch": 3.804541671620497,
"grad_norm": 0.9354658126831055,
"learning_rate": 4.0488645820948765e-05,
"loss": 2.6593,
"step": 32000
},
{
"epoch": 3.8639876352395675,
"grad_norm": 0.9209592938423157,
"learning_rate": 4.034003091190108e-05,
"loss": 2.6547,
"step": 32500
},
{
"epoch": 3.9234335988586375,
"grad_norm": 0.8945015668869019,
"learning_rate": 4.019141600285341e-05,
"loss": 2.6719,
"step": 33000
},
{
"epoch": 3.9828795624777076,
"grad_norm": 0.9823748469352722,
"learning_rate": 4.004280109380573e-05,
"loss": 2.6781,
"step": 33500
},
{
"epoch": 4.042325526096778,
"grad_norm": 1.0186822414398193,
"learning_rate": 3.989418618475806e-05,
"loss": 2.6469,
"step": 34000
},
{
"epoch": 4.101771489715849,
"grad_norm": 0.9255732297897339,
"learning_rate": 3.974557127571038e-05,
"loss": 2.6296,
"step": 34500
},
{
"epoch": 4.161217453334919,
"grad_norm": 1.0235294103622437,
"learning_rate": 3.959695636666271e-05,
"loss": 2.6358,
"step": 35000
},
{
"epoch": 4.220663416953989,
"grad_norm": 0.911547064781189,
"learning_rate": 3.944834145761503e-05,
"loss": 2.6354,
"step": 35500
},
{
"epoch": 4.280109380573059,
"grad_norm": 1.0124516487121582,
"learning_rate": 3.929972654856735e-05,
"loss": 2.6416,
"step": 36000
},
{
"epoch": 4.33955534419213,
"grad_norm": 1.0222316980361938,
"learning_rate": 3.9151111639519676e-05,
"loss": 2.6188,
"step": 36500
},
{
"epoch": 4.3990013078112,
"grad_norm": 0.9710135459899902,
"learning_rate": 3.9002496730472005e-05,
"loss": 2.6228,
"step": 37000
},
{
"epoch": 4.45844727143027,
"grad_norm": 1.0287182331085205,
"learning_rate": 3.885388182142433e-05,
"loss": 2.6067,
"step": 37500
},
{
"epoch": 4.51789323504934,
"grad_norm": 0.9699456095695496,
"learning_rate": 3.870526691237665e-05,
"loss": 2.6385,
"step": 38000
},
{
"epoch": 4.57733919866841,
"grad_norm": 0.9066009521484375,
"learning_rate": 3.855665200332897e-05,
"loss": 2.6284,
"step": 38500
},
{
"epoch": 4.636785162287481,
"grad_norm": 0.8537769317626953,
"learning_rate": 3.84080370942813e-05,
"loss": 2.6135,
"step": 39000
},
{
"epoch": 4.696231125906551,
"grad_norm": 1.0666980743408203,
"learning_rate": 3.8259422185233626e-05,
"loss": 2.6312,
"step": 39500
},
{
"epoch": 4.755677089525621,
"grad_norm": 1.0641474723815918,
"learning_rate": 3.811080727618595e-05,
"loss": 2.6127,
"step": 40000
},
{
"epoch": 4.815123053144691,
"grad_norm": 1.076323390007019,
"learning_rate": 3.796219236713827e-05,
"loss": 2.6184,
"step": 40500
},
{
"epoch": 4.874569016763762,
"grad_norm": 0.8963558077812195,
"learning_rate": 3.78135774580906e-05,
"loss": 2.6165,
"step": 41000
},
{
"epoch": 4.934014980382832,
"grad_norm": 0.968908429145813,
"learning_rate": 3.766496254904292e-05,
"loss": 2.6009,
"step": 41500
},
{
"epoch": 4.993460944001902,
"grad_norm": 0.9362033605575562,
"learning_rate": 3.7516347639995246e-05,
"loss": 2.5956,
"step": 42000
},
{
"epoch": 5.052906907620972,
"grad_norm": 1.1101199388504028,
"learning_rate": 3.736773273094757e-05,
"loss": 2.5755,
"step": 42500
},
{
"epoch": 5.112352871240043,
"grad_norm": 1.2178868055343628,
"learning_rate": 3.72191178218999e-05,
"loss": 2.5724,
"step": 43000
},
{
"epoch": 5.171798834859113,
"grad_norm": 1.0143418312072754,
"learning_rate": 3.707050291285222e-05,
"loss": 2.5834,
"step": 43500
},
{
"epoch": 5.231244798478183,
"grad_norm": 0.9720271825790405,
"learning_rate": 3.6921888003804544e-05,
"loss": 2.586,
"step": 44000
},
{
"epoch": 5.290690762097253,
"grad_norm": 0.8847070932388306,
"learning_rate": 3.6773273094756866e-05,
"loss": 2.5953,
"step": 44500
},
{
"epoch": 5.3501367257163235,
"grad_norm": 0.9654759764671326,
"learning_rate": 3.6624658185709196e-05,
"loss": 2.5777,
"step": 45000
},
{
"epoch": 5.409582689335394,
"grad_norm": 0.9272730350494385,
"learning_rate": 3.647604327666151e-05,
"loss": 2.5774,
"step": 45500
},
{
"epoch": 5.4690286529544645,
"grad_norm": 0.9674676656723022,
"learning_rate": 3.632742836761384e-05,
"loss": 2.5779,
"step": 46000
},
{
"epoch": 5.528474616573535,
"grad_norm": 1.0238367319107056,
"learning_rate": 3.6178813458566164e-05,
"loss": 2.5683,
"step": 46500
},
{
"epoch": 5.587920580192605,
"grad_norm": 1.1663753986358643,
"learning_rate": 3.603019854951849e-05,
"loss": 2.5802,
"step": 47000
},
{
"epoch": 5.647366543811675,
"grad_norm": 0.8961432576179504,
"learning_rate": 3.588158364047081e-05,
"loss": 2.5726,
"step": 47500
},
{
"epoch": 5.706812507430746,
"grad_norm": 1.1115467548370361,
"learning_rate": 3.573296873142314e-05,
"loss": 2.5719,
"step": 48000
},
{
"epoch": 5.766258471049816,
"grad_norm": 1.00434148311615,
"learning_rate": 3.558435382237546e-05,
"loss": 2.556,
"step": 48500
},
{
"epoch": 5.825704434668886,
"grad_norm": 1.1120518445968628,
"learning_rate": 3.5435738913327784e-05,
"loss": 2.5627,
"step": 49000
},
{
"epoch": 5.885150398287957,
"grad_norm": 0.9611983299255371,
"learning_rate": 3.528712400428011e-05,
"loss": 2.5568,
"step": 49500
},
{
"epoch": 5.944596361907027,
"grad_norm": 1.1176481246948242,
"learning_rate": 3.5138509095232436e-05,
"loss": 2.5634,
"step": 50000
}
],
"logging_steps": 500,
"max_steps": 168220,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 2.090205609984e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}