HikariBloom-v0.3-RP / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.983277591973244,
"eval_steps": 37,
"global_step": 222,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013377926421404682,
"grad_norm": 7.5698137283325195,
"learning_rate": 1.3333333333333332e-06,
"loss": 2.1804,
"step": 1
},
{
"epoch": 0.026755852842809364,
"grad_norm": 7.3767266273498535,
"learning_rate": 2.6666666666666664e-06,
"loss": 2.1927,
"step": 2
},
{
"epoch": 0.04013377926421405,
"grad_norm": 7.829778671264648,
"learning_rate": 4e-06,
"loss": 2.3279,
"step": 3
},
{
"epoch": 0.05351170568561873,
"grad_norm": 2.793825626373291,
"learning_rate": 5.333333333333333e-06,
"loss": 1.9777,
"step": 4
},
{
"epoch": 0.06688963210702341,
"grad_norm": 1.4661837816238403,
"learning_rate": 6.666666666666667e-06,
"loss": 1.8485,
"step": 5
},
{
"epoch": 0.0802675585284281,
"grad_norm": 1.2292248010635376,
"learning_rate": 8e-06,
"loss": 1.9523,
"step": 6
},
{
"epoch": 0.09364548494983277,
"grad_norm": 1.240803599357605,
"learning_rate": 7.99957692770843e-06,
"loss": 1.9104,
"step": 7
},
{
"epoch": 0.10702341137123746,
"grad_norm": 0.8672861456871033,
"learning_rate": 7.998307800328803e-06,
"loss": 1.9006,
"step": 8
},
{
"epoch": 0.12040133779264214,
"grad_norm": 0.7724849581718445,
"learning_rate": 7.996192886327432e-06,
"loss": 1.8721,
"step": 9
},
{
"epoch": 0.13377926421404682,
"grad_norm": 0.6626549959182739,
"learning_rate": 7.993232633085074e-06,
"loss": 1.8403,
"step": 10
},
{
"epoch": 0.14715719063545152,
"grad_norm": 0.7850075364112854,
"learning_rate": 7.989427666802289e-06,
"loss": 1.8972,
"step": 11
},
{
"epoch": 0.1605351170568562,
"grad_norm": 0.6829317808151245,
"learning_rate": 7.984778792366982e-06,
"loss": 1.815,
"step": 12
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.5757900476455688,
"learning_rate": 7.979286993184132e-06,
"loss": 1.7474,
"step": 13
},
{
"epoch": 0.18729096989966554,
"grad_norm": 0.5840671062469482,
"learning_rate": 7.972953430967771e-06,
"loss": 1.872,
"step": 14
},
{
"epoch": 0.20066889632107024,
"grad_norm": 0.6052594780921936,
"learning_rate": 7.965779445495242e-06,
"loss": 1.7793,
"step": 15
},
{
"epoch": 0.2140468227424749,
"grad_norm": 0.5719857215881348,
"learning_rate": 7.957766554323777e-06,
"loss": 1.8001,
"step": 16
},
{
"epoch": 0.22742474916387959,
"grad_norm": 0.6494969129562378,
"learning_rate": 7.948916452469496e-06,
"loss": 1.8784,
"step": 17
},
{
"epoch": 0.2408026755852843,
"grad_norm": 0.5779961347579956,
"learning_rate": 7.939231012048832e-06,
"loss": 1.8493,
"step": 18
},
{
"epoch": 0.25418060200668896,
"grad_norm": 0.519511342048645,
"learning_rate": 7.928712281882523e-06,
"loss": 1.8679,
"step": 19
},
{
"epoch": 0.26755852842809363,
"grad_norm": 0.6307725310325623,
"learning_rate": 7.917362487062206e-06,
"loss": 1.8664,
"step": 20
},
{
"epoch": 0.2809364548494983,
"grad_norm": 0.521139919757843,
"learning_rate": 7.905184028479733e-06,
"loss": 1.7756,
"step": 21
},
{
"epoch": 0.29431438127090304,
"grad_norm": 0.5131444931030273,
"learning_rate": 7.892179482319294e-06,
"loss": 1.6563,
"step": 22
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.563713014125824,
"learning_rate": 7.878351599512464e-06,
"loss": 1.852,
"step": 23
},
{
"epoch": 0.3210702341137124,
"grad_norm": 0.5473136901855469,
"learning_rate": 7.863703305156273e-06,
"loss": 1.8271,
"step": 24
},
{
"epoch": 0.33444816053511706,
"grad_norm": 0.49893084168434143,
"learning_rate": 7.848237697894452e-06,
"loss": 1.7639,
"step": 25
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.47465410828590393,
"learning_rate": 7.831958049261955e-06,
"loss": 1.8612,
"step": 26
},
{
"epoch": 0.3612040133779264,
"grad_norm": 0.5295856595039368,
"learning_rate": 7.814867802992907e-06,
"loss": 1.819,
"step": 27
},
{
"epoch": 0.3745819397993311,
"grad_norm": 0.4863497316837311,
"learning_rate": 7.796970574292136e-06,
"loss": 1.7617,
"step": 28
},
{
"epoch": 0.3879598662207358,
"grad_norm": 0.5433112978935242,
"learning_rate": 7.778270149070419e-06,
"loss": 1.7289,
"step": 29
},
{
"epoch": 0.4013377926421405,
"grad_norm": 0.5282914638519287,
"learning_rate": 7.758770483143633e-06,
"loss": 1.8131,
"step": 30
},
{
"epoch": 0.41471571906354515,
"grad_norm": 0.5243386030197144,
"learning_rate": 7.738475701395954e-06,
"loss": 1.8339,
"step": 31
},
{
"epoch": 0.4280936454849498,
"grad_norm": 0.49295875430107117,
"learning_rate": 7.717390096907289e-06,
"loss": 1.8133,
"step": 32
},
{
"epoch": 0.4414715719063545,
"grad_norm": 0.5231158137321472,
"learning_rate": 7.695518130045147e-06,
"loss": 1.8031,
"step": 33
},
{
"epoch": 0.45484949832775917,
"grad_norm": 0.5049412250518799,
"learning_rate": 7.672864427521097e-06,
"loss": 1.7918,
"step": 34
},
{
"epoch": 0.4682274247491639,
"grad_norm": 0.5039061903953552,
"learning_rate": 7.649433781412057e-06,
"loss": 1.741,
"step": 35
},
{
"epoch": 0.4816053511705686,
"grad_norm": 0.5041850805282593,
"learning_rate": 7.6252311481465996e-06,
"loss": 1.7254,
"step": 36
},
{
"epoch": 0.49498327759197325,
"grad_norm": 0.4633885622024536,
"learning_rate": 7.600261647456484e-06,
"loss": 1.8132,
"step": 37
},
{
"epoch": 0.49498327759197325,
"eval_loss": 0.667718768119812,
"eval_runtime": 13.4399,
"eval_samples_per_second": 90.626,
"eval_steps_per_second": 5.729,
"step": 37
},
{
"epoch": 0.5083612040133779,
"grad_norm": 0.5181726217269897,
"learning_rate": 7.574530561293649e-06,
"loss": 1.882,
"step": 38
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.5037977695465088,
"learning_rate": 7.548043332712886e-06,
"loss": 1.8253,
"step": 39
},
{
"epoch": 0.5351170568561873,
"grad_norm": 0.4691613018512726,
"learning_rate": 7.520805564720443e-06,
"loss": 1.7016,
"step": 40
},
{
"epoch": 0.5484949832775919,
"grad_norm": 0.4761461019515991,
"learning_rate": 7.492823019088783e-06,
"loss": 1.8041,
"step": 41
},
{
"epoch": 0.5618729096989966,
"grad_norm": 0.4626379907131195,
"learning_rate": 7.4641016151377545e-06,
"loss": 1.7852,
"step": 42
},
{
"epoch": 0.5752508361204013,
"grad_norm": 0.49921584129333496,
"learning_rate": 7.434647428482453e-06,
"loss": 1.7104,
"step": 43
},
{
"epoch": 0.5886287625418061,
"grad_norm": 0.49447470903396606,
"learning_rate": 7.4044666897479985e-06,
"loss": 1.7973,
"step": 44
},
{
"epoch": 0.6020066889632107,
"grad_norm": 0.4844650328159332,
"learning_rate": 7.373565783251543e-06,
"loss": 1.7678,
"step": 45
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.49107274413108826,
"learning_rate": 7.3419512456517455e-06,
"loss": 1.718,
"step": 46
},
{
"epoch": 0.6287625418060201,
"grad_norm": 0.49630844593048096,
"learning_rate": 7.309629764566041e-06,
"loss": 1.802,
"step": 47
},
{
"epoch": 0.6421404682274248,
"grad_norm": 0.47247716784477234,
"learning_rate": 7.276608177155967e-06,
"loss": 1.7803,
"step": 48
},
{
"epoch": 0.6555183946488294,
"grad_norm": 1.2681446075439453,
"learning_rate": 7.242893468680849e-06,
"loss": 1.768,
"step": 49
},
{
"epoch": 0.6688963210702341,
"grad_norm": 0.5203042030334473,
"learning_rate": 7.208492771020175e-06,
"loss": 1.8885,
"step": 50
},
{
"epoch": 0.6822742474916388,
"grad_norm": 0.5055291056632996,
"learning_rate": 7.1734133611649405e-06,
"loss": 1.812,
"step": 51
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.5043210387229919,
"learning_rate": 7.137662659678303e-06,
"loss": 1.8291,
"step": 52
},
{
"epoch": 0.7090301003344481,
"grad_norm": 0.5000115633010864,
"learning_rate": 7.1012482291258626e-06,
"loss": 1.7115,
"step": 53
},
{
"epoch": 0.7224080267558528,
"grad_norm": 0.5015853643417358,
"learning_rate": 7.064177772475912e-06,
"loss": 1.8441,
"step": 54
},
{
"epoch": 0.7357859531772575,
"grad_norm": 0.5204277038574219,
"learning_rate": 7.026459131469972e-06,
"loss": 1.8268,
"step": 55
},
{
"epoch": 0.7491638795986622,
"grad_norm": 0.5002334117889404,
"learning_rate": 6.9881002849639835e-06,
"loss": 1.7633,
"step": 56
},
{
"epoch": 0.7625418060200669,
"grad_norm": 0.47437921166419983,
"learning_rate": 6.949109347240496e-06,
"loss": 1.7573,
"step": 57
},
{
"epoch": 0.7759197324414716,
"grad_norm": 0.46609270572662354,
"learning_rate": 6.909494566292195e-06,
"loss": 1.7671,
"step": 58
},
{
"epoch": 0.7892976588628763,
"grad_norm": 0.464740514755249,
"learning_rate": 6.869264322077157e-06,
"loss": 1.735,
"step": 59
},
{
"epoch": 0.802675585284281,
"grad_norm": 0.44125059247016907,
"learning_rate": 6.82842712474619e-06,
"loss": 1.6895,
"step": 60
},
{
"epoch": 0.8160535117056856,
"grad_norm": 0.4398334324359894,
"learning_rate": 6.786991612842619e-06,
"loss": 1.6622,
"step": 61
},
{
"epoch": 0.8294314381270903,
"grad_norm": 0.5073325037956238,
"learning_rate": 6.744966551474935e-06,
"loss": 1.7893,
"step": 62
},
{
"epoch": 0.842809364548495,
"grad_norm": 0.4685400128364563,
"learning_rate": 6.702360830462641e-06,
"loss": 1.7377,
"step": 63
},
{
"epoch": 0.8561872909698997,
"grad_norm": 0.5076963305473328,
"learning_rate": 6.65918346245575e-06,
"loss": 1.8125,
"step": 64
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.510735273361206,
"learning_rate": 6.615443581028279e-06,
"loss": 1.8576,
"step": 65
},
{
"epoch": 0.882943143812709,
"grad_norm": 0.489524781703949,
"learning_rate": 6.571150438746157e-06,
"loss": 1.6699,
"step": 66
},
{
"epoch": 0.8963210702341137,
"grad_norm": 0.4763016402721405,
"learning_rate": 6.5263134052099895e-06,
"loss": 1.7561,
"step": 67
},
{
"epoch": 0.9096989966555183,
"grad_norm": 0.4900319278240204,
"learning_rate": 6.480941965073039e-06,
"loss": 1.7364,
"step": 68
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.4876985549926758,
"learning_rate": 6.435045716034882e-06,
"loss": 1.7202,
"step": 69
},
{
"epoch": 0.9364548494983278,
"grad_norm": 0.5038361549377441,
"learning_rate": 6.388634366811145e-06,
"loss": 1.7732,
"step": 70
},
{
"epoch": 0.9498327759197325,
"grad_norm": 0.5064495801925659,
"learning_rate": 6.341717735079762e-06,
"loss": 1.7221,
"step": 71
},
{
"epoch": 0.9632107023411371,
"grad_norm": 0.48319771885871887,
"learning_rate": 6.294305745404184e-06,
"loss": 1.7169,
"step": 72
},
{
"epoch": 0.9765886287625418,
"grad_norm": 0.4659038782119751,
"learning_rate": 6.246408427133971e-06,
"loss": 1.7655,
"step": 73
},
{
"epoch": 0.9899665551839465,
"grad_norm": 0.5067590475082397,
"learning_rate": 6.198035912283224e-06,
"loss": 1.7013,
"step": 74
},
{
"epoch": 0.9899665551839465,
"eval_loss": 0.6622208952903748,
"eval_runtime": 13.4118,
"eval_samples_per_second": 90.815,
"eval_steps_per_second": 5.741,
"step": 74
},
{
"epoch": 1.0100334448160535,
"grad_norm": 1.075007438659668,
"learning_rate": 6.149198433387296e-06,
"loss": 2.8527,
"step": 75
},
{
"epoch": 1.0234113712374582,
"grad_norm": 0.5882135629653931,
"learning_rate": 6.09990632133824e-06,
"loss": 1.2937,
"step": 76
},
{
"epoch": 1.0367892976588629,
"grad_norm": 0.5747379064559937,
"learning_rate": 6.050170003199461e-06,
"loss": 1.3659,
"step": 77
},
{
"epoch": 1.0501672240802675,
"grad_norm": 0.5323824286460876,
"learning_rate": 6e-06,
"loss": 1.4013,
"step": 78
},
{
"epoch": 1.0635451505016722,
"grad_norm": 0.5771108865737915,
"learning_rate": 5.94940692450897e-06,
"loss": 1.4312,
"step": 79
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.6475838422775269,
"learning_rate": 5.898401478990562e-06,
"loss": 1.5826,
"step": 80
},
{
"epoch": 1.0903010033444815,
"grad_norm": 0.592047929763794,
"learning_rate": 5.846994452940136e-06,
"loss": 1.4368,
"step": 81
},
{
"epoch": 1.1036789297658862,
"grad_norm": 0.7535114884376526,
"learning_rate": 5.795196720801849e-06,
"loss": 1.5116,
"step": 82
},
{
"epoch": 1.117056856187291,
"grad_norm": 0.586352527141571,
"learning_rate": 5.743019239668317e-06,
"loss": 1.4519,
"step": 83
},
{
"epoch": 1.1304347826086956,
"grad_norm": 0.617953360080719,
"learning_rate": 5.690473046962798e-06,
"loss": 1.516,
"step": 84
},
{
"epoch": 1.1438127090301002,
"grad_norm": 0.5882145166397095,
"learning_rate": 5.63756925810437e-06,
"loss": 1.3272,
"step": 85
},
{
"epoch": 1.1571906354515051,
"grad_norm": 0.5454166531562805,
"learning_rate": 5.584319064156627e-06,
"loss": 1.4443,
"step": 86
},
{
"epoch": 1.1705685618729098,
"grad_norm": 0.5162122249603271,
"learning_rate": 5.530733729460359e-06,
"loss": 1.409,
"step": 87
},
{
"epoch": 1.1839464882943145,
"grad_norm": 0.5802710056304932,
"learning_rate": 5.476824589250738e-06,
"loss": 1.4925,
"step": 88
},
{
"epoch": 1.1973244147157192,
"grad_norm": 0.5536505579948425,
"learning_rate": 5.4226030472595064e-06,
"loss": 1.3556,
"step": 89
},
{
"epoch": 1.2107023411371238,
"grad_norm": 0.5804270505905151,
"learning_rate": 5.368080573302675e-06,
"loss": 1.4867,
"step": 90
},
{
"epoch": 1.2240802675585285,
"grad_norm": 0.5988459587097168,
"learning_rate": 5.3132687008542446e-06,
"loss": 1.5466,
"step": 91
},
{
"epoch": 1.2374581939799332,
"grad_norm": 0.5100234150886536,
"learning_rate": 5.2581790246064545e-06,
"loss": 1.3042,
"step": 92
},
{
"epoch": 1.2508361204013378,
"grad_norm": 0.547820508480072,
"learning_rate": 5.2028231980170915e-06,
"loss": 1.4344,
"step": 93
},
{
"epoch": 1.2642140468227425,
"grad_norm": 0.5059947967529297,
"learning_rate": 5.147212930844361e-06,
"loss": 1.3313,
"step": 94
},
{
"epoch": 1.2775919732441472,
"grad_norm": 0.5130128860473633,
"learning_rate": 5.091359986669844e-06,
"loss": 1.3825,
"step": 95
},
{
"epoch": 1.2909698996655519,
"grad_norm": 0.5295710563659668,
"learning_rate": 5.035276180410083e-06,
"loss": 1.3594,
"step": 96
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.5060495734214783,
"learning_rate": 4.978973375817295e-06,
"loss": 1.3036,
"step": 97
},
{
"epoch": 1.3177257525083612,
"grad_norm": 0.5988255143165588,
"learning_rate": 4.922463482969761e-06,
"loss": 1.5651,
"step": 98
},
{
"epoch": 1.3311036789297659,
"grad_norm": 0.4946533739566803,
"learning_rate": 4.8657584557524116e-06,
"loss": 1.333,
"step": 99
},
{
"epoch": 1.3444816053511706,
"grad_norm": 0.5674816966056824,
"learning_rate": 4.808870289328152e-06,
"loss": 1.4971,
"step": 100
},
{
"epoch": 1.3578595317725752,
"grad_norm": 0.5148370265960693,
"learning_rate": 4.751811017600447e-06,
"loss": 1.4789,
"step": 101
},
{
"epoch": 1.37123745819398,
"grad_norm": 0.5132340788841248,
"learning_rate": 4.694592710667722e-06,
"loss": 1.3163,
"step": 102
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.6035396456718445,
"learning_rate": 4.637227472270091e-06,
"loss": 1.3789,
"step": 103
},
{
"epoch": 1.3979933110367893,
"grad_norm": 0.550737738609314,
"learning_rate": 4.579727437228986e-06,
"loss": 1.4218,
"step": 104
},
{
"epoch": 1.411371237458194,
"grad_norm": 0.4987228810787201,
"learning_rate": 4.522104768880207e-06,
"loss": 1.264,
"step": 105
},
{
"epoch": 1.4247491638795986,
"grad_norm": 0.5271551609039307,
"learning_rate": 4.4643716565009205e-06,
"loss": 1.4445,
"step": 106
},
{
"epoch": 1.4381270903010033,
"grad_norm": 0.5551120638847351,
"learning_rate": 4.406540312731208e-06,
"loss": 1.5199,
"step": 107
},
{
"epoch": 1.451505016722408,
"grad_norm": 0.5053355097770691,
"learning_rate": 4.348622970990633e-06,
"loss": 1.3389,
"step": 108
},
{
"epoch": 1.4648829431438126,
"grad_norm": 0.5177690386772156,
"learning_rate": 4.290631882890443e-06,
"loss": 1.4396,
"step": 109
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.522657573223114,
"learning_rate": 4.232579315641903e-06,
"loss": 1.5001,
"step": 110
},
{
"epoch": 1.491638795986622,
"grad_norm": 0.5218788981437683,
"learning_rate": 4.174477549461344e-06,
"loss": 1.3964,
"step": 111
},
{
"epoch": 1.491638795986622,
"eval_loss": 0.6926424503326416,
"eval_runtime": 13.4107,
"eval_samples_per_second": 90.823,
"eval_steps_per_second": 5.742,
"step": 111
},
{
"epoch": 1.5050167224080266,
"grad_norm": 0.513609766960144,
"learning_rate": 4.1163388749724456e-06,
"loss": 1.3159,
"step": 112
},
{
"epoch": 1.5183946488294313,
"grad_norm": 0.5356954336166382,
"learning_rate": 4.058175590606331e-06,
"loss": 1.4034,
"step": 113
},
{
"epoch": 1.531772575250836,
"grad_norm": 0.5953348278999329,
"learning_rate": 4e-06,
"loss": 1.4772,
"step": 114
},
{
"epoch": 1.5451505016722407,
"grad_norm": 0.5957881808280945,
"learning_rate": 3.941824409393669e-06,
"loss": 1.5237,
"step": 115
},
{
"epoch": 1.5585284280936453,
"grad_norm": 0.4759249687194824,
"learning_rate": 3.883661125027554e-06,
"loss": 1.27,
"step": 116
},
{
"epoch": 1.57190635451505,
"grad_norm": 0.5109943747520447,
"learning_rate": 3.825522450538656e-06,
"loss": 1.4649,
"step": 117
},
{
"epoch": 1.585284280936455,
"grad_norm": 0.477067768573761,
"learning_rate": 3.7674206843580965e-06,
"loss": 1.3081,
"step": 118
},
{
"epoch": 1.5986622073578596,
"grad_norm": 0.505376398563385,
"learning_rate": 3.7093681171095572e-06,
"loss": 1.3395,
"step": 119
},
{
"epoch": 1.6120401337792643,
"grad_norm": 0.5487738251686096,
"learning_rate": 3.651377029009367e-06,
"loss": 1.529,
"step": 120
},
{
"epoch": 1.625418060200669,
"grad_norm": 0.5177662968635559,
"learning_rate": 3.5934596872687923e-06,
"loss": 1.291,
"step": 121
},
{
"epoch": 1.6387959866220736,
"grad_norm": 0.5494332909584045,
"learning_rate": 3.5356283434990783e-06,
"loss": 1.4541,
"step": 122
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.526679277420044,
"learning_rate": 3.4778952311197945e-06,
"loss": 1.3409,
"step": 123
},
{
"epoch": 1.665551839464883,
"grad_norm": 0.5405285954475403,
"learning_rate": 3.4202725627710133e-06,
"loss": 1.4129,
"step": 124
},
{
"epoch": 1.6789297658862876,
"grad_norm": 0.5305699110031128,
"learning_rate": 3.36277252772991e-06,
"loss": 1.458,
"step": 125
},
{
"epoch": 1.6923076923076923,
"grad_norm": 0.5097222924232483,
"learning_rate": 3.3054072893322785e-06,
"loss": 1.3555,
"step": 126
},
{
"epoch": 1.705685618729097,
"grad_norm": 0.5460465550422668,
"learning_rate": 3.2481889823995524e-06,
"loss": 1.3272,
"step": 127
},
{
"epoch": 1.7190635451505016,
"grad_norm": 0.5369409918785095,
"learning_rate": 3.191129710671849e-06,
"loss": 1.2991,
"step": 128
},
{
"epoch": 1.7324414715719063,
"grad_norm": 0.4934872090816498,
"learning_rate": 3.1342415442475885e-06,
"loss": 1.319,
"step": 129
},
{
"epoch": 1.745819397993311,
"grad_norm": 0.5528122186660767,
"learning_rate": 3.077536517030239e-06,
"loss": 1.4731,
"step": 130
},
{
"epoch": 1.7591973244147159,
"grad_norm": 0.5233715176582336,
"learning_rate": 3.0210266241827046e-06,
"loss": 1.4089,
"step": 131
},
{
"epoch": 1.7725752508361206,
"grad_norm": 0.5456512570381165,
"learning_rate": 2.9647238195899164e-06,
"loss": 1.4056,
"step": 132
},
{
"epoch": 1.7859531772575252,
"grad_norm": 0.5461183190345764,
"learning_rate": 2.908640013330157e-06,
"loss": 1.4384,
"step": 133
},
{
"epoch": 1.79933110367893,
"grad_norm": 0.5198376178741455,
"learning_rate": 2.85278706915564e-06,
"loss": 1.48,
"step": 134
},
{
"epoch": 1.8127090301003346,
"grad_norm": 0.5073018670082092,
"learning_rate": 2.7971768019829077e-06,
"loss": 1.3335,
"step": 135
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.5290614366531372,
"learning_rate": 2.741820975393546e-06,
"loss": 1.4239,
"step": 136
},
{
"epoch": 1.839464882943144,
"grad_norm": 0.5525597333908081,
"learning_rate": 2.686731299145756e-06,
"loss": 1.4017,
"step": 137
},
{
"epoch": 1.8528428093645486,
"grad_norm": 0.5554612874984741,
"learning_rate": 2.631919426697325e-06,
"loss": 1.5295,
"step": 138
},
{
"epoch": 1.8662207357859533,
"grad_norm": 0.5276882648468018,
"learning_rate": 2.5773969527404945e-06,
"loss": 1.3571,
"step": 139
},
{
"epoch": 1.879598662207358,
"grad_norm": 0.506417453289032,
"learning_rate": 2.5231754107492627e-06,
"loss": 1.3666,
"step": 140
},
{
"epoch": 1.8929765886287626,
"grad_norm": 0.535830557346344,
"learning_rate": 2.469266270539641e-06,
"loss": 1.5119,
"step": 141
},
{
"epoch": 1.9063545150501673,
"grad_norm": 0.5409619808197021,
"learning_rate": 2.4156809358433725e-06,
"loss": 1.4349,
"step": 142
},
{
"epoch": 1.919732441471572,
"grad_norm": 0.5141175985336304,
"learning_rate": 2.3624307418956294e-06,
"loss": 1.3672,
"step": 143
},
{
"epoch": 1.9331103678929766,
"grad_norm": 0.5471431612968445,
"learning_rate": 2.309526953037203e-06,
"loss": 1.3575,
"step": 144
},
{
"epoch": 1.9464882943143813,
"grad_norm": 0.5435221195220947,
"learning_rate": 2.256980760331683e-06,
"loss": 1.4398,
"step": 145
},
{
"epoch": 1.959866220735786,
"grad_norm": 0.5480269193649292,
"learning_rate": 2.2048032791981513e-06,
"loss": 1.279,
"step": 146
},
{
"epoch": 1.9732441471571907,
"grad_norm": 0.5423163175582886,
"learning_rate": 2.153005547059865e-06,
"loss": 1.4763,
"step": 147
},
{
"epoch": 1.9866220735785953,
"grad_norm": 0.5127543807029724,
"learning_rate": 2.1015985210094384e-06,
"loss": 1.3808,
"step": 148
},
{
"epoch": 1.9866220735785953,
"eval_loss": 0.7046768069267273,
"eval_runtime": 13.4302,
"eval_samples_per_second": 90.691,
"eval_steps_per_second": 5.733,
"step": 148
},
{
"epoch": 2.0066889632107023,
"grad_norm": 1.2160006761550903,
"learning_rate": 2.050593075491031e-06,
"loss": 2.3417,
"step": 149
},
{
"epoch": 2.020066889632107,
"grad_norm": 0.7894352674484253,
"learning_rate": 2.0000000000000008e-06,
"loss": 1.1381,
"step": 150
},
{
"epoch": 2.0334448160535117,
"grad_norm": 0.790090799331665,
"learning_rate": 1.9498299968005392e-06,
"loss": 1.2723,
"step": 151
},
{
"epoch": 2.0468227424749164,
"grad_norm": 0.6597563624382019,
"learning_rate": 1.9000936786617597e-06,
"loss": 1.1166,
"step": 152
},
{
"epoch": 2.060200668896321,
"grad_norm": 0.6646838784217834,
"learning_rate": 1.850801566612704e-06,
"loss": 1.2064,
"step": 153
},
{
"epoch": 2.0735785953177257,
"grad_norm": 0.616726815700531,
"learning_rate": 1.801964087716776e-06,
"loss": 1.1856,
"step": 154
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.6225292682647705,
"learning_rate": 1.7535915728660289e-06,
"loss": 1.0631,
"step": 155
},
{
"epoch": 2.100334448160535,
"grad_norm": 0.9003037214279175,
"learning_rate": 1.7056942545958167e-06,
"loss": 1.1332,
"step": 156
},
{
"epoch": 2.1137123745819397,
"grad_norm": 1.1400443315505981,
"learning_rate": 1.6582822649202379e-06,
"loss": 1.0256,
"step": 157
},
{
"epoch": 2.1270903010033444,
"grad_norm": 1.0252878665924072,
"learning_rate": 1.611365633188856e-06,
"loss": 1.0825,
"step": 158
},
{
"epoch": 2.140468227424749,
"grad_norm": 0.785536527633667,
"learning_rate": 1.5649542839651173e-06,
"loss": 1.1539,
"step": 159
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.6228716373443604,
"learning_rate": 1.5190580349269603e-06,
"loss": 1.1693,
"step": 160
},
{
"epoch": 2.1672240802675584,
"grad_norm": 0.605522096157074,
"learning_rate": 1.4736865947900103e-06,
"loss": 1.0761,
"step": 161
},
{
"epoch": 2.180602006688963,
"grad_norm": 0.6409484148025513,
"learning_rate": 1.4288495612538425e-06,
"loss": 1.1637,
"step": 162
},
{
"epoch": 2.1939799331103678,
"grad_norm": 0.6087141036987305,
"learning_rate": 1.3845564189717216e-06,
"loss": 1.0937,
"step": 163
},
{
"epoch": 2.2073578595317724,
"grad_norm": 0.7026439309120178,
"learning_rate": 1.3408165375442484e-06,
"loss": 1.2132,
"step": 164
},
{
"epoch": 2.220735785953177,
"grad_norm": 0.6594187617301941,
"learning_rate": 1.297639169537359e-06,
"loss": 1.07,
"step": 165
},
{
"epoch": 2.234113712374582,
"grad_norm": 0.6606442928314209,
"learning_rate": 1.255033448525066e-06,
"loss": 1.1694,
"step": 166
},
{
"epoch": 2.2474916387959865,
"grad_norm": 0.6164308786392212,
"learning_rate": 1.2130083871573812e-06,
"loss": 1.0824,
"step": 167
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.5823544263839722,
"learning_rate": 1.1715728752538101e-06,
"loss": 1.1106,
"step": 168
},
{
"epoch": 2.274247491638796,
"grad_norm": 0.5872677564620972,
"learning_rate": 1.130735677922842e-06,
"loss": 1.1056,
"step": 169
},
{
"epoch": 2.2876254180602005,
"grad_norm": 0.6060868501663208,
"learning_rate": 1.090505433707805e-06,
"loss": 1.0993,
"step": 170
},
{
"epoch": 2.3010033444816056,
"grad_norm": 0.7028762698173523,
"learning_rate": 1.050890652759504e-06,
"loss": 1.3655,
"step": 171
},
{
"epoch": 2.3143812709030103,
"grad_norm": 0.6295290589332581,
"learning_rate": 1.0118997150360166e-06,
"loss": 1.0931,
"step": 172
},
{
"epoch": 2.327759197324415,
"grad_norm": 0.6583987474441528,
"learning_rate": 9.735408685300286e-07,
"loss": 1.1103,
"step": 173
},
{
"epoch": 2.3411371237458196,
"grad_norm": 0.7007333040237427,
"learning_rate": 9.358222275240884e-07,
"loss": 1.1794,
"step": 174
},
{
"epoch": 2.3545150501672243,
"grad_norm": 0.6823887825012207,
"learning_rate": 8.987517708741363e-07,
"loss": 1.1575,
"step": 175
},
{
"epoch": 2.367892976588629,
"grad_norm": 0.680305540561676,
"learning_rate": 8.623373403216971e-07,
"loss": 1.096,
"step": 176
},
{
"epoch": 2.3812709030100336,
"grad_norm": 0.6234930157661438,
"learning_rate": 8.265866388350598e-07,
"loss": 1.0486,
"step": 177
},
{
"epoch": 2.3946488294314383,
"grad_norm": 0.6473740339279175,
"learning_rate": 7.915072289798246e-07,
"loss": 1.1637,
"step": 178
},
{
"epoch": 2.408026755852843,
"grad_norm": 0.6634021997451782,
"learning_rate": 7.571065313191511e-07,
"loss": 1.1053,
"step": 179
},
{
"epoch": 2.4214046822742477,
"grad_norm": 0.6495158672332764,
"learning_rate": 7.233918228440323e-07,
"loss": 1.0907,
"step": 180
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.6720609664916992,
"learning_rate": 6.903702354339578e-07,
"loss": 1.1751,
"step": 181
},
{
"epoch": 2.448160535117057,
"grad_norm": 0.6688068509101868,
"learning_rate": 6.580487543482549e-07,
"loss": 1.1408,
"step": 182
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.6397896409034729,
"learning_rate": 6.26434216748458e-07,
"loss": 1.2012,
"step": 183
},
{
"epoch": 2.4749163879598663,
"grad_norm": 0.6841992735862732,
"learning_rate": 5.955333102520011e-07,
"loss": 1.2623,
"step": 184
},
{
"epoch": 2.488294314381271,
"grad_norm": 0.6013959050178528,
"learning_rate": 5.653525715175483e-07,
"loss": 1.0792,
"step": 185
},
{
"epoch": 2.488294314381271,
"eval_loss": 0.8052845597267151,
"eval_runtime": 13.448,
"eval_samples_per_second": 90.571,
"eval_steps_per_second": 5.726,
"step": 185
},
{
"epoch": 2.5016722408026757,
"grad_norm": 0.595483660697937,
"learning_rate": 5.358983848622451e-07,
"loss": 1.1536,
"step": 186
},
{
"epoch": 2.5150501672240804,
"grad_norm": 0.6301653981208801,
"learning_rate": 5.07176980911217e-07,
"loss": 1.1543,
"step": 187
},
{
"epoch": 2.528428093645485,
"grad_norm": 0.6083581447601318,
"learning_rate": 4.791944352795561e-07,
"loss": 1.131,
"step": 188
},
{
"epoch": 2.5418060200668897,
"grad_norm": 0.6019948720932007,
"learning_rate": 4.519566672871131e-07,
"loss": 1.1022,
"step": 189
},
{
"epoch": 2.5551839464882944,
"grad_norm": 0.5989395976066589,
"learning_rate": 4.2546943870635135e-07,
"loss": 1.1402,
"step": 190
},
{
"epoch": 2.568561872909699,
"grad_norm": 0.575457751750946,
"learning_rate": 3.997383525435154e-07,
"loss": 1.0687,
"step": 191
},
{
"epoch": 2.5819397993311037,
"grad_norm": 0.6226676106452942,
"learning_rate": 3.7476885185340023e-07,
"loss": 1.158,
"step": 192
},
{
"epoch": 2.5953177257525084,
"grad_norm": 0.6265813112258911,
"learning_rate": 3.5056621858794387e-07,
"loss": 1.1689,
"step": 193
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.6149746775627136,
"learning_rate": 3.2713557247890447e-07,
"loss": 1.1482,
"step": 194
},
{
"epoch": 2.6220735785953178,
"grad_norm": 0.555928647518158,
"learning_rate": 3.0448186995485303e-07,
"loss": 0.9814,
"step": 195
},
{
"epoch": 2.6354515050167224,
"grad_norm": 0.6666916608810425,
"learning_rate": 2.826099030927098e-07,
"loss": 1.2773,
"step": 196
},
{
"epoch": 2.648829431438127,
"grad_norm": 0.6038864850997925,
"learning_rate": 2.6152429860404646e-07,
"loss": 1.0263,
"step": 197
},
{
"epoch": 2.6622073578595318,
"grad_norm": 0.6544002890586853,
"learning_rate": 2.412295168563667e-07,
"loss": 1.2501,
"step": 198
},
{
"epoch": 2.6755852842809364,
"grad_norm": 0.5613058805465698,
"learning_rate": 2.2172985092958128e-07,
"loss": 1.0164,
"step": 199
},
{
"epoch": 2.688963210702341,
"grad_norm": 0.6110493540763855,
"learning_rate": 2.0302942570786442e-07,
"loss": 1.142,
"step": 200
},
{
"epoch": 2.702341137123746,
"grad_norm": 0.6497470140457153,
"learning_rate": 1.851321970070927e-07,
"loss": 1.1498,
"step": 201
},
{
"epoch": 2.7157190635451505,
"grad_norm": 0.6384419202804565,
"learning_rate": 1.680419507380444e-07,
"loss": 1.1044,
"step": 202
},
{
"epoch": 2.729096989966555,
"grad_norm": 0.6009129285812378,
"learning_rate": 1.5176230210554742e-07,
"loss": 1.13,
"step": 203
},
{
"epoch": 2.74247491638796,
"grad_norm": 0.5934394001960754,
"learning_rate": 1.3629669484372718e-07,
"loss": 1.0401,
"step": 204
},
{
"epoch": 2.7558528428093645,
"grad_norm": 0.669292151927948,
"learning_rate": 1.21648400487536e-07,
"loss": 1.2259,
"step": 205
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.6402983665466309,
"learning_rate": 1.0782051768070477e-07,
"loss": 1.1421,
"step": 206
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.6059122085571289,
"learning_rate": 9.481597152026654e-08,
"loss": 1.1015,
"step": 207
},
{
"epoch": 2.7959866220735785,
"grad_norm": 0.6018065214157104,
"learning_rate": 8.263751293779408e-08,
"loss": 1.1427,
"step": 208
},
{
"epoch": 2.809364548494983,
"grad_norm": 0.6087521910667419,
"learning_rate": 7.128771811747736e-08,
"loss": 1.1633,
"step": 209
},
{
"epoch": 2.822742474916388,
"grad_norm": 0.6287218928337097,
"learning_rate": 6.076898795116792e-08,
"loss": 1.1612,
"step": 210
},
{
"epoch": 2.8361204013377925,
"grad_norm": 0.6059502363204956,
"learning_rate": 5.108354753050381e-08,
"loss": 1.0879,
"step": 211
},
{
"epoch": 2.849498327759197,
"grad_norm": 0.5889873504638672,
"learning_rate": 4.2233445676222114e-08,
"loss": 1.1121,
"step": 212
},
{
"epoch": 2.862876254180602,
"grad_norm": 0.6066433787345886,
"learning_rate": 3.422055450475847e-08,
"loss": 1.102,
"step": 213
},
{
"epoch": 2.8762541806020065,
"grad_norm": 0.6160590648651123,
"learning_rate": 2.7046569032227905e-08,
"loss": 1.1017,
"step": 214
},
{
"epoch": 2.8896321070234112,
"grad_norm": 0.6136374473571777,
"learning_rate": 2.0713006815868074e-08,
"loss": 1.1346,
"step": 215
},
{
"epoch": 2.903010033444816,
"grad_norm": 0.6150422692298889,
"learning_rate": 1.522120763301782e-08,
"loss": 1.1271,
"step": 216
},
{
"epoch": 2.9163879598662206,
"grad_norm": 0.6284250617027283,
"learning_rate": 1.0572333197711003e-08,
"loss": 1.1855,
"step": 217
},
{
"epoch": 2.9297658862876252,
"grad_norm": 0.5995332598686218,
"learning_rate": 6.767366914927297e-09,
"loss": 1.1039,
"step": 218
},
{
"epoch": 2.94314381270903,
"grad_norm": 0.5566285252571106,
"learning_rate": 3.807113672568807e-09,
"loss": 1.0683,
"step": 219
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.5810141563415527,
"learning_rate": 1.6921996711976028e-09,
"loss": 1.1098,
"step": 220
},
{
"epoch": 2.9698996655518393,
"grad_norm": 0.6116142868995667,
"learning_rate": 4.230722915701257e-10,
"loss": 1.0441,
"step": 221
},
{
"epoch": 2.983277591973244,
"grad_norm": 0.6009790301322937,
"learning_rate": 0.0,
"loss": 1.1404,
"step": 222
},
{
"epoch": 2.983277591973244,
"eval_loss": 0.8089934587478638,
"eval_runtime": 13.4287,
"eval_samples_per_second": 90.701,
"eval_steps_per_second": 5.734,
"step": 222
}
],
"logging_steps": 1,
"max_steps": 222,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 37,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.24107627264947e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}