doodle-dash2 / trainer_state.json
laszlokiss27's picture
Upload 8 files
5bd74a2 verified
raw
history blame
No virus
20.2 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 5000,
"global_step": 87895,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05688605722737357,
"grad_norm": 2.4062280654907227,
"learning_rate": 0.0007908982308436202,
"loss": 2.1918,
"step": 1000
},
{
"epoch": 0.11377211445474714,
"grad_norm": 1.431848406791687,
"learning_rate": 0.0007817964616872405,
"loss": 1.4818,
"step": 2000
},
{
"epoch": 0.17065817168212072,
"grad_norm": 1.5747077465057373,
"learning_rate": 0.0007726946925308607,
"loss": 1.3634,
"step": 3000
},
{
"epoch": 0.22754422890949427,
"grad_norm": 1.4864206314086914,
"learning_rate": 0.0007635929233744809,
"loss": 1.2967,
"step": 4000
},
{
"epoch": 0.2844302861368678,
"grad_norm": 1.2000905275344849,
"learning_rate": 0.0007544911542181011,
"loss": 1.2574,
"step": 5000
},
{
"epoch": 0.2844302861368678,
"eval_accuracy": 0.689128,
"eval_loss": 1.238457202911377,
"eval_runtime": 203.0197,
"eval_samples_per_second": 1231.407,
"eval_steps_per_second": 4.812,
"step": 5000
},
{
"epoch": 0.34131634336424144,
"grad_norm": 1.2910780906677246,
"learning_rate": 0.0007453893850617214,
"loss": 1.2181,
"step": 6000
},
{
"epoch": 0.398202400591615,
"grad_norm": 1.1383774280548096,
"learning_rate": 0.0007362876159053416,
"loss": 1.1863,
"step": 7000
},
{
"epoch": 0.45508845781898855,
"grad_norm": 1.135689616203308,
"learning_rate": 0.0007271858467489618,
"loss": 1.1653,
"step": 8000
},
{
"epoch": 0.5119745150463622,
"grad_norm": 1.1965036392211914,
"learning_rate": 0.0007180840775925821,
"loss": 1.147,
"step": 9000
},
{
"epoch": 0.5688605722737357,
"grad_norm": 1.0561026334762573,
"learning_rate": 0.0007089823084362024,
"loss": 1.1281,
"step": 10000
},
{
"epoch": 0.5688605722737357,
"eval_accuracy": 0.715764,
"eval_loss": 1.1192152500152588,
"eval_runtime": 128.8781,
"eval_samples_per_second": 1939.817,
"eval_steps_per_second": 7.581,
"step": 10000
},
{
"epoch": 0.6257466295011093,
"grad_norm": 0.9711835980415344,
"learning_rate": 0.0006998805392798226,
"loss": 1.1232,
"step": 11000
},
{
"epoch": 0.6826326867284829,
"grad_norm": 0.8913602828979492,
"learning_rate": 0.0006907787701234428,
"loss": 1.0988,
"step": 12000
},
{
"epoch": 0.7395187439558564,
"grad_norm": 1.092698097229004,
"learning_rate": 0.000681677000967063,
"loss": 1.0897,
"step": 13000
},
{
"epoch": 0.79640480118323,
"grad_norm": 0.9319038391113281,
"learning_rate": 0.0006725752318106833,
"loss": 1.0826,
"step": 14000
},
{
"epoch": 0.8532908584106036,
"grad_norm": 1.0223675966262817,
"learning_rate": 0.0006634734626543035,
"loss": 1.0698,
"step": 15000
},
{
"epoch": 0.8532908584106036,
"eval_accuracy": 0.728676,
"eval_loss": 1.0653605461120605,
"eval_runtime": 128.0826,
"eval_samples_per_second": 1951.866,
"eval_steps_per_second": 7.628,
"step": 15000
},
{
"epoch": 0.9101769156379771,
"grad_norm": 0.8995338678359985,
"learning_rate": 0.0006543716934979237,
"loss": 1.0624,
"step": 16000
},
{
"epoch": 0.9670629728653507,
"grad_norm": 0.8418471217155457,
"learning_rate": 0.0006452699243415439,
"loss": 1.0538,
"step": 17000
},
{
"epoch": 1.0239490300927243,
"grad_norm": 1.024624228477478,
"learning_rate": 0.0006361681551851641,
"loss": 1.0311,
"step": 18000
},
{
"epoch": 1.0808350873200978,
"grad_norm": 0.9130891561508179,
"learning_rate": 0.0006270663860287844,
"loss": 0.999,
"step": 19000
},
{
"epoch": 1.1377211445474713,
"grad_norm": 0.8896342515945435,
"learning_rate": 0.0006179646168724045,
"loss": 1.0,
"step": 20000
},
{
"epoch": 1.1377211445474713,
"eval_accuracy": 0.739712,
"eval_loss": 1.0235533714294434,
"eval_runtime": 127.2585,
"eval_samples_per_second": 1964.505,
"eval_steps_per_second": 7.677,
"step": 20000
},
{
"epoch": 1.194607201774845,
"grad_norm": 0.7940112948417664,
"learning_rate": 0.0006088628477160248,
"loss": 0.9957,
"step": 21000
},
{
"epoch": 1.2514932590022185,
"grad_norm": 0.9015308618545532,
"learning_rate": 0.000599761078559645,
"loss": 0.9967,
"step": 22000
},
{
"epoch": 1.3083793162295922,
"grad_norm": 0.9106078147888184,
"learning_rate": 0.0005906593094032653,
"loss": 0.9939,
"step": 23000
},
{
"epoch": 1.3652653734569657,
"grad_norm": 0.9563422203063965,
"learning_rate": 0.0005815575402468854,
"loss": 0.9931,
"step": 24000
},
{
"epoch": 1.4221514306843392,
"grad_norm": 0.7646272778511047,
"learning_rate": 0.0005724557710905057,
"loss": 0.9774,
"step": 25000
},
{
"epoch": 1.4221514306843392,
"eval_accuracy": 0.743348,
"eval_loss": 1.0054922103881836,
"eval_runtime": 127.7729,
"eval_samples_per_second": 1956.596,
"eval_steps_per_second": 7.646,
"step": 25000
},
{
"epoch": 1.4790374879117127,
"grad_norm": 0.7779045104980469,
"learning_rate": 0.000563354001934126,
"loss": 0.9792,
"step": 26000
},
{
"epoch": 1.5359235451390862,
"grad_norm": 0.8506484627723694,
"learning_rate": 0.0005542522327777463,
"loss": 0.9778,
"step": 27000
},
{
"epoch": 1.59280960236646,
"grad_norm": 0.8443676829338074,
"learning_rate": 0.0005451504636213664,
"loss": 0.9715,
"step": 28000
},
{
"epoch": 1.6496956595938337,
"grad_norm": 0.9333568215370178,
"learning_rate": 0.0005360486944649867,
"loss": 0.9679,
"step": 29000
},
{
"epoch": 1.7065817168212072,
"grad_norm": 0.9501623511314392,
"learning_rate": 0.0005269469253086069,
"loss": 0.9684,
"step": 30000
},
{
"epoch": 1.7065817168212072,
"eval_accuracy": 0.749276,
"eval_loss": 0.9812818765640259,
"eval_runtime": 128.5758,
"eval_samples_per_second": 1944.379,
"eval_steps_per_second": 7.599,
"step": 30000
},
{
"epoch": 1.7634677740485807,
"grad_norm": 0.7442188262939453,
"learning_rate": 0.0005178451561522272,
"loss": 0.9636,
"step": 31000
},
{
"epoch": 1.8203538312759542,
"grad_norm": 0.7510819435119629,
"learning_rate": 0.0005087433869958473,
"loss": 0.9647,
"step": 32000
},
{
"epoch": 1.8772398885033277,
"grad_norm": 0.7448764443397522,
"learning_rate": 0.0004996416178394676,
"loss": 0.9591,
"step": 33000
},
{
"epoch": 1.9341259457307014,
"grad_norm": 0.8019358515739441,
"learning_rate": 0.0004905398486830878,
"loss": 0.9513,
"step": 34000
},
{
"epoch": 1.9910120029580751,
"grad_norm": 0.9495121240615845,
"learning_rate": 0.00048143807952670797,
"loss": 0.9511,
"step": 35000
},
{
"epoch": 1.9910120029580751,
"eval_accuracy": 0.755448,
"eval_loss": 0.9558805227279663,
"eval_runtime": 127.8711,
"eval_samples_per_second": 1955.094,
"eval_steps_per_second": 7.641,
"step": 35000
},
{
"epoch": 2.0478980601854486,
"grad_norm": 0.8410281538963318,
"learning_rate": 0.00047233631037032825,
"loss": 0.9081,
"step": 36000
},
{
"epoch": 2.104784117412822,
"grad_norm": 0.8246123194694519,
"learning_rate": 0.00046323454121394847,
"loss": 0.8964,
"step": 37000
},
{
"epoch": 2.1616701746401956,
"grad_norm": 0.9567108154296875,
"learning_rate": 0.0004541327720575687,
"loss": 0.8952,
"step": 38000
},
{
"epoch": 2.218556231867569,
"grad_norm": 0.8104901313781738,
"learning_rate": 0.0004450310029011889,
"loss": 0.8925,
"step": 39000
},
{
"epoch": 2.2754422890949426,
"grad_norm": 0.9034276008605957,
"learning_rate": 0.0004359292337448092,
"loss": 0.8998,
"step": 40000
},
{
"epoch": 2.2754422890949426,
"eval_accuracy": 0.755948,
"eval_loss": 0.9492226839065552,
"eval_runtime": 127.8812,
"eval_samples_per_second": 1954.94,
"eval_steps_per_second": 7.64,
"step": 40000
},
{
"epoch": 2.3323283463223166,
"grad_norm": 1.3229442834854126,
"learning_rate": 0.00042682746458842937,
"loss": 0.8962,
"step": 41000
},
{
"epoch": 2.38921440354969,
"grad_norm": 0.8582925200462341,
"learning_rate": 0.00041772569543204965,
"loss": 0.8976,
"step": 42000
},
{
"epoch": 2.4461004607770636,
"grad_norm": 0.8881712555885315,
"learning_rate": 0.0004086239262756698,
"loss": 0.8898,
"step": 43000
},
{
"epoch": 2.502986518004437,
"grad_norm": 0.8713961839675903,
"learning_rate": 0.00039952215711929005,
"loss": 0.8927,
"step": 44000
},
{
"epoch": 2.5598725752318106,
"grad_norm": 0.7883007526397705,
"learning_rate": 0.00039042038796291027,
"loss": 0.8967,
"step": 45000
},
{
"epoch": 2.5598725752318106,
"eval_accuracy": 0.760028,
"eval_loss": 0.937300980091095,
"eval_runtime": 130.0782,
"eval_samples_per_second": 1921.921,
"eval_steps_per_second": 7.511,
"step": 45000
},
{
"epoch": 2.6167586324591845,
"grad_norm": 0.8600155711174011,
"learning_rate": 0.00038131861880653055,
"loss": 0.8927,
"step": 46000
},
{
"epoch": 2.673644689686558,
"grad_norm": 0.8501909971237183,
"learning_rate": 0.0003722168496501508,
"loss": 0.8913,
"step": 47000
},
{
"epoch": 2.7305307469139315,
"grad_norm": 0.8116582632064819,
"learning_rate": 0.000363115080493771,
"loss": 0.8889,
"step": 48000
},
{
"epoch": 2.787416804141305,
"grad_norm": 0.8065186738967896,
"learning_rate": 0.0003540133113373912,
"loss": 0.8896,
"step": 49000
},
{
"epoch": 2.8443028613686785,
"grad_norm": 0.9248031973838806,
"learning_rate": 0.00034491154218101145,
"loss": 0.8837,
"step": 50000
},
{
"epoch": 2.8443028613686785,
"eval_accuracy": 0.762176,
"eval_loss": 0.9251159429550171,
"eval_runtime": 128.4439,
"eval_samples_per_second": 1946.376,
"eval_steps_per_second": 7.606,
"step": 50000
},
{
"epoch": 2.901188918596052,
"grad_norm": 0.8191467523574829,
"learning_rate": 0.0003358097730246317,
"loss": 0.878,
"step": 51000
},
{
"epoch": 2.9580749758234255,
"grad_norm": 0.7620063424110413,
"learning_rate": 0.0003267080038682519,
"loss": 0.8832,
"step": 52000
},
{
"epoch": 3.0149610330507994,
"grad_norm": 0.8365482687950134,
"learning_rate": 0.0003176062347118721,
"loss": 0.8621,
"step": 53000
},
{
"epoch": 3.071847090278173,
"grad_norm": 0.9817807078361511,
"learning_rate": 0.00030850446555549235,
"loss": 0.8224,
"step": 54000
},
{
"epoch": 3.1287331475055464,
"grad_norm": 0.847806453704834,
"learning_rate": 0.00029940269639911263,
"loss": 0.8253,
"step": 55000
},
{
"epoch": 3.1287331475055464,
"eval_accuracy": 0.76438,
"eval_loss": 0.9235970973968506,
"eval_runtime": 126.2531,
"eval_samples_per_second": 1980.15,
"eval_steps_per_second": 7.738,
"step": 55000
},
{
"epoch": 3.18561920473292,
"grad_norm": 1.1729530096054077,
"learning_rate": 0.00029030092724273285,
"loss": 0.8225,
"step": 56000
},
{
"epoch": 3.2425052619602934,
"grad_norm": 1.0548408031463623,
"learning_rate": 0.0002811991580863531,
"loss": 0.821,
"step": 57000
},
{
"epoch": 3.299391319187667,
"grad_norm": 1.0199774503707886,
"learning_rate": 0.0002720973889299733,
"loss": 0.8213,
"step": 58000
},
{
"epoch": 3.356277376415041,
"grad_norm": 0.9180177450180054,
"learning_rate": 0.00026299561977359353,
"loss": 0.8274,
"step": 59000
},
{
"epoch": 3.4131634336424144,
"grad_norm": 0.9745663404464722,
"learning_rate": 0.0002538938506172137,
"loss": 0.8229,
"step": 60000
},
{
"epoch": 3.4131634336424144,
"eval_accuracy": 0.766832,
"eval_loss": 0.9138370156288147,
"eval_runtime": 129.2727,
"eval_samples_per_second": 1933.897,
"eval_steps_per_second": 7.558,
"step": 60000
},
{
"epoch": 3.470049490869788,
"grad_norm": 0.8708947896957397,
"learning_rate": 0.0002447920814608339,
"loss": 0.8256,
"step": 61000
},
{
"epoch": 3.5269355480971614,
"grad_norm": 0.9808185696601868,
"learning_rate": 0.00023569031230445418,
"loss": 0.8298,
"step": 62000
},
{
"epoch": 3.583821605324535,
"grad_norm": 0.8228833079338074,
"learning_rate": 0.0002265885431480744,
"loss": 0.827,
"step": 63000
},
{
"epoch": 3.6407076625519084,
"grad_norm": 0.9581019878387451,
"learning_rate": 0.00021748677399169463,
"loss": 0.8275,
"step": 64000
},
{
"epoch": 3.697593719779282,
"grad_norm": 0.8560314178466797,
"learning_rate": 0.00020838500483531488,
"loss": 0.8145,
"step": 65000
},
{
"epoch": 3.697593719779282,
"eval_accuracy": 0.769172,
"eval_loss": 0.9042648673057556,
"eval_runtime": 129.2138,
"eval_samples_per_second": 1934.778,
"eval_steps_per_second": 7.561,
"step": 65000
},
{
"epoch": 3.754479777006656,
"grad_norm": 0.8918451070785522,
"learning_rate": 0.0001992832356789351,
"loss": 0.819,
"step": 66000
},
{
"epoch": 3.8113658342340293,
"grad_norm": 1.0977294445037842,
"learning_rate": 0.00019018146652255533,
"loss": 0.8122,
"step": 67000
},
{
"epoch": 3.868251891461403,
"grad_norm": 0.7856444716453552,
"learning_rate": 0.00018107969736617555,
"loss": 0.8225,
"step": 68000
},
{
"epoch": 3.9251379486887763,
"grad_norm": 0.9270259141921997,
"learning_rate": 0.00017197792820979578,
"loss": 0.8158,
"step": 69000
},
{
"epoch": 3.98202400591615,
"grad_norm": 1.082774043083191,
"learning_rate": 0.00016287615905341603,
"loss": 0.8156,
"step": 70000
},
{
"epoch": 3.98202400591615,
"eval_accuracy": 0.770764,
"eval_loss": 0.8961142301559448,
"eval_runtime": 138.0555,
"eval_samples_per_second": 1810.866,
"eval_steps_per_second": 7.077,
"step": 70000
},
{
"epoch": 4.038910063143524,
"grad_norm": 0.909858226776123,
"learning_rate": 0.00015377438989703626,
"loss": 0.7785,
"step": 71000
},
{
"epoch": 4.095796120370897,
"grad_norm": 0.931280791759491,
"learning_rate": 0.00014467262074065645,
"loss": 0.7637,
"step": 72000
},
{
"epoch": 4.152682177598271,
"grad_norm": 0.94422847032547,
"learning_rate": 0.0001355708515842767,
"loss": 0.7612,
"step": 73000
},
{
"epoch": 4.209568234825644,
"grad_norm": 0.9250127077102661,
"learning_rate": 0.00012646908242789693,
"loss": 0.7616,
"step": 74000
},
{
"epoch": 4.266454292053018,
"grad_norm": 0.8467296957969666,
"learning_rate": 0.00011736731327151716,
"loss": 0.7557,
"step": 75000
},
{
"epoch": 4.266454292053018,
"eval_accuracy": 0.77204,
"eval_loss": 0.9022773504257202,
"eval_runtime": 144.5432,
"eval_samples_per_second": 1729.587,
"eval_steps_per_second": 6.759,
"step": 75000
},
{
"epoch": 4.323340349280391,
"grad_norm": 0.8985564708709717,
"learning_rate": 0.00010826554411513738,
"loss": 0.7604,
"step": 76000
},
{
"epoch": 4.380226406507765,
"grad_norm": 0.8618564605712891,
"learning_rate": 9.916377495875762e-05,
"loss": 0.7632,
"step": 77000
},
{
"epoch": 4.437112463735138,
"grad_norm": 0.9467126727104187,
"learning_rate": 9.006200580237784e-05,
"loss": 0.7614,
"step": 78000
},
{
"epoch": 4.493998520962512,
"grad_norm": 1.0163730382919312,
"learning_rate": 8.096023664599807e-05,
"loss": 0.7575,
"step": 79000
},
{
"epoch": 4.550884578189885,
"grad_norm": 1.1194038391113281,
"learning_rate": 7.18584674896183e-05,
"loss": 0.7595,
"step": 80000
},
{
"epoch": 4.550884578189885,
"eval_accuracy": 0.772256,
"eval_loss": 0.897346019744873,
"eval_runtime": 136.9434,
"eval_samples_per_second": 1825.571,
"eval_steps_per_second": 7.134,
"step": 80000
},
{
"epoch": 4.607770635417259,
"grad_norm": 1.0589629411697388,
"learning_rate": 6.275669833323853e-05,
"loss": 0.7548,
"step": 81000
},
{
"epoch": 4.664656692644633,
"grad_norm": 0.8540852665901184,
"learning_rate": 5.365492917685876e-05,
"loss": 0.7601,
"step": 82000
},
{
"epoch": 4.721542749872007,
"grad_norm": 1.127475380897522,
"learning_rate": 4.455316002047898e-05,
"loss": 0.7554,
"step": 83000
},
{
"epoch": 4.77842880709938,
"grad_norm": 0.9464063048362732,
"learning_rate": 3.545139086409921e-05,
"loss": 0.756,
"step": 84000
},
{
"epoch": 4.835314864326754,
"grad_norm": 0.9705914855003357,
"learning_rate": 2.634962170771944e-05,
"loss": 0.7581,
"step": 85000
},
{
"epoch": 4.835314864326754,
"eval_accuracy": 0.773724,
"eval_loss": 0.8925997018814087,
"eval_runtime": 138.7415,
"eval_samples_per_second": 1801.913,
"eval_steps_per_second": 7.042,
"step": 85000
},
{
"epoch": 4.892200921554127,
"grad_norm": 0.8879310488700867,
"learning_rate": 1.7247852551339668e-05,
"loss": 0.758,
"step": 86000
},
{
"epoch": 4.949086978781501,
"grad_norm": 1.2024400234222412,
"learning_rate": 8.146083394959896e-06,
"loss": 0.751,
"step": 87000
},
{
"epoch": 5.0,
"step": 87895,
"total_flos": 1.93274424e+18,
"train_loss": 0.9357909288237652,
"train_runtime": 45635.435,
"train_samples_per_second": 493.038,
"train_steps_per_second": 1.926
}
],
"logging_steps": 1000,
"max_steps": 87895,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 5000,
"total_flos": 1.93274424e+18,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}