bad-apple-8b-v2 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 268,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0037313432835820895,
"grad_norm": 11.718372519156443,
"learning_rate": 2e-07,
"loss": 1.5946,
"step": 1
},
{
"epoch": 0.007462686567164179,
"grad_norm": 12.194885050328885,
"learning_rate": 4e-07,
"loss": 1.6052,
"step": 2
},
{
"epoch": 0.011194029850746268,
"grad_norm": 13.01588928537949,
"learning_rate": 6e-07,
"loss": 1.7294,
"step": 3
},
{
"epoch": 0.014925373134328358,
"grad_norm": 12.313961908592717,
"learning_rate": 8e-07,
"loss": 1.5217,
"step": 4
},
{
"epoch": 0.018656716417910446,
"grad_norm": 12.37356816651012,
"learning_rate": 1e-06,
"loss": 1.3742,
"step": 5
},
{
"epoch": 0.022388059701492536,
"grad_norm": 11.427627877521694,
"learning_rate": 1.2e-06,
"loss": 1.4112,
"step": 6
},
{
"epoch": 0.026119402985074626,
"grad_norm": 10.518944089588336,
"learning_rate": 1.4e-06,
"loss": 1.6147,
"step": 7
},
{
"epoch": 0.029850746268656716,
"grad_norm": 8.76984656973648,
"learning_rate": 1.6e-06,
"loss": 1.4108,
"step": 8
},
{
"epoch": 0.033582089552238806,
"grad_norm": 8.084328772350803,
"learning_rate": 1.8e-06,
"loss": 1.2484,
"step": 9
},
{
"epoch": 0.03731343283582089,
"grad_norm": 9.697158992831765,
"learning_rate": 2e-06,
"loss": 1.23,
"step": 10
},
{
"epoch": 0.041044776119402986,
"grad_norm": 18.09050975452378,
"learning_rate": 1.9999821640202585e-06,
"loss": 1.2535,
"step": 11
},
{
"epoch": 0.04477611940298507,
"grad_norm": 18.87482868101649,
"learning_rate": 1.9999286567172775e-06,
"loss": 1.7845,
"step": 12
},
{
"epoch": 0.048507462686567165,
"grad_norm": 16.46811558730055,
"learning_rate": 1.999839479999768e-06,
"loss": 1.5637,
"step": 13
},
{
"epoch": 0.05223880597014925,
"grad_norm": 17.76805068894177,
"learning_rate": 1.999714637048838e-06,
"loss": 1.3015,
"step": 14
},
{
"epoch": 0.055970149253731345,
"grad_norm": 16.79626847324504,
"learning_rate": 1.9995541323178804e-06,
"loss": 1.6793,
"step": 15
},
{
"epoch": 0.05970149253731343,
"grad_norm": 11.89468375626604,
"learning_rate": 1.9993579715324135e-06,
"loss": 1.3764,
"step": 16
},
{
"epoch": 0.06343283582089553,
"grad_norm": 10.93288879427306,
"learning_rate": 1.9991261616898766e-06,
"loss": 1.3707,
"step": 17
},
{
"epoch": 0.06716417910447761,
"grad_norm": 8.361062086748422,
"learning_rate": 1.9988587110593807e-06,
"loss": 1.6238,
"step": 18
},
{
"epoch": 0.0708955223880597,
"grad_norm": 6.736084108094181,
"learning_rate": 1.9985556291814147e-06,
"loss": 1.2496,
"step": 19
},
{
"epoch": 0.07462686567164178,
"grad_norm": 7.272816093776056,
"learning_rate": 1.9982169268675023e-06,
"loss": 1.5627,
"step": 20
},
{
"epoch": 0.07835820895522388,
"grad_norm": 6.329740952982906,
"learning_rate": 1.997842616199819e-06,
"loss": 1.3453,
"step": 21
},
{
"epoch": 0.08208955223880597,
"grad_norm": 4.520010885801565,
"learning_rate": 1.99743271053076e-06,
"loss": 1.5461,
"step": 22
},
{
"epoch": 0.08582089552238806,
"grad_norm": 3.6042605956174354,
"learning_rate": 1.9969872244824635e-06,
"loss": 1.5243,
"step": 23
},
{
"epoch": 0.08955223880597014,
"grad_norm": 6.07998924732389,
"learning_rate": 1.99650617394629e-06,
"loss": 1.1925,
"step": 24
},
{
"epoch": 0.09328358208955224,
"grad_norm": 5.379400172850125,
"learning_rate": 1.9959895760822544e-06,
"loss": 1.3644,
"step": 25
},
{
"epoch": 0.09701492537313433,
"grad_norm": 5.724766090503273,
"learning_rate": 1.995437449318415e-06,
"loss": 1.282,
"step": 26
},
{
"epoch": 0.10074626865671642,
"grad_norm": 7.952440800639504,
"learning_rate": 1.994849813350215e-06,
"loss": 1.2719,
"step": 27
},
{
"epoch": 0.1044776119402985,
"grad_norm": 5.5605547177462915,
"learning_rate": 1.9942266891397812e-06,
"loss": 1.4344,
"step": 28
},
{
"epoch": 0.10820895522388059,
"grad_norm": 4.20729688038585,
"learning_rate": 1.9935680989151754e-06,
"loss": 1.4261,
"step": 29
},
{
"epoch": 0.11194029850746269,
"grad_norm": 4.179661464504617,
"learning_rate": 1.9928740661696007e-06,
"loss": 1.7263,
"step": 30
},
{
"epoch": 0.11567164179104478,
"grad_norm": 4.444811669926014,
"learning_rate": 1.992144615660566e-06,
"loss": 1.4733,
"step": 31
},
{
"epoch": 0.11940298507462686,
"grad_norm": 4.028798590389647,
"learning_rate": 1.9913797734089995e-06,
"loss": 1.2016,
"step": 32
},
{
"epoch": 0.12313432835820895,
"grad_norm": 3.667451061021437,
"learning_rate": 1.990579566698323e-06,
"loss": 1.3823,
"step": 33
},
{
"epoch": 0.12686567164179105,
"grad_norm": 3.204493378117863,
"learning_rate": 1.9897440240734786e-06,
"loss": 1.1922,
"step": 34
},
{
"epoch": 0.13059701492537312,
"grad_norm": 2.5157465668345225,
"learning_rate": 1.9888731753399087e-06,
"loss": 1.3169,
"step": 35
},
{
"epoch": 0.13432835820895522,
"grad_norm": 4.013532823842619,
"learning_rate": 1.9879670515624933e-06,
"loss": 1.5473,
"step": 36
},
{
"epoch": 0.13805970149253732,
"grad_norm": 4.443422336518452,
"learning_rate": 1.9870256850644436e-06,
"loss": 1.3413,
"step": 37
},
{
"epoch": 0.1417910447761194,
"grad_norm": 3.8185223864854443,
"learning_rate": 1.9860491094261476e-06,
"loss": 1.3775,
"step": 38
},
{
"epoch": 0.1455223880597015,
"grad_norm": 3.2374300805591356,
"learning_rate": 1.9850373594839715e-06,
"loss": 1.4237,
"step": 39
},
{
"epoch": 0.14925373134328357,
"grad_norm": 3.4010570222238945,
"learning_rate": 1.9839904713290183e-06,
"loss": 1.3512,
"step": 40
},
{
"epoch": 0.15298507462686567,
"grad_norm": 3.7069360522451036,
"learning_rate": 1.9829084823058396e-06,
"loss": 1.3539,
"step": 41
},
{
"epoch": 0.15671641791044777,
"grad_norm": 4.189014809102456,
"learning_rate": 1.9817914310111044e-06,
"loss": 1.184,
"step": 42
},
{
"epoch": 0.16044776119402984,
"grad_norm": 2.6617535657025764,
"learning_rate": 1.980639357292221e-06,
"loss": 0.9118,
"step": 43
},
{
"epoch": 0.16417910447761194,
"grad_norm": 5.853386102660827,
"learning_rate": 1.9794523022459164e-06,
"loss": 1.2803,
"step": 44
},
{
"epoch": 0.16791044776119404,
"grad_norm": 6.062445920092195,
"learning_rate": 1.9782303082167703e-06,
"loss": 1.1335,
"step": 45
},
{
"epoch": 0.17164179104477612,
"grad_norm": 3.931954985578852,
"learning_rate": 1.976973418795704e-06,
"loss": 1.3316,
"step": 46
},
{
"epoch": 0.17537313432835822,
"grad_norm": 4.463556977447332,
"learning_rate": 1.9756816788184255e-06,
"loss": 1.0166,
"step": 47
},
{
"epoch": 0.1791044776119403,
"grad_norm": 4.622340592834071,
"learning_rate": 1.974355134363832e-06,
"loss": 1.3222,
"step": 48
},
{
"epoch": 0.1828358208955224,
"grad_norm": 6.981137384760035,
"learning_rate": 1.972993832752363e-06,
"loss": 1.2864,
"step": 49
},
{
"epoch": 0.1865671641791045,
"grad_norm": 7.538142483116736,
"learning_rate": 1.9715978225443146e-06,
"loss": 1.3298,
"step": 50
},
{
"epoch": 0.19029850746268656,
"grad_norm": 5.378377013811346,
"learning_rate": 1.970167153538106e-06,
"loss": 1.6898,
"step": 51
},
{
"epoch": 0.19402985074626866,
"grad_norm": 4.107675209526753,
"learning_rate": 1.9687018767685044e-06,
"loss": 1.2467,
"step": 52
},
{
"epoch": 0.19776119402985073,
"grad_norm": 3.462570336890335,
"learning_rate": 1.9672020445048035e-06,
"loss": 1.2439,
"step": 53
},
{
"epoch": 0.20149253731343283,
"grad_norm": 5.1132284991358565,
"learning_rate": 1.9656677102489587e-06,
"loss": 1.2553,
"step": 54
},
{
"epoch": 0.20522388059701493,
"grad_norm": 6.340348167811004,
"learning_rate": 1.964098928733679e-06,
"loss": 1.1768,
"step": 55
},
{
"epoch": 0.208955223880597,
"grad_norm": 3.9131654258908415,
"learning_rate": 1.962495755920476e-06,
"loss": 1.2787,
"step": 56
},
{
"epoch": 0.2126865671641791,
"grad_norm": 4.822496966837534,
"learning_rate": 1.9608582489976645e-06,
"loss": 0.9751,
"step": 57
},
{
"epoch": 0.21641791044776118,
"grad_norm": 6.830738771850188,
"learning_rate": 1.959186466378326e-06,
"loss": 1.1173,
"step": 58
},
{
"epoch": 0.22014925373134328,
"grad_norm": 3.7173186685103716,
"learning_rate": 1.9574804676982214e-06,
"loss": 1.3968,
"step": 59
},
{
"epoch": 0.22388059701492538,
"grad_norm": 2.826816507380959,
"learning_rate": 1.955740313813667e-06,
"loss": 1.2454,
"step": 60
},
{
"epoch": 0.22761194029850745,
"grad_norm": 5.060694147656036,
"learning_rate": 1.9539660667993617e-06,
"loss": 1.2803,
"step": 61
},
{
"epoch": 0.23134328358208955,
"grad_norm": 4.866896848059289,
"learning_rate": 1.952157789946173e-06,
"loss": 1.3675,
"step": 62
},
{
"epoch": 0.23507462686567165,
"grad_norm": 3.6902361398869843,
"learning_rate": 1.9503155477588792e-06,
"loss": 1.3265,
"step": 63
},
{
"epoch": 0.23880597014925373,
"grad_norm": 3.844263841899348,
"learning_rate": 1.9484394059538696e-06,
"loss": 1.1316,
"step": 64
},
{
"epoch": 0.24253731343283583,
"grad_norm": 2.7586273990747094,
"learning_rate": 1.9465294314567986e-06,
"loss": 1.1227,
"step": 65
},
{
"epoch": 0.2462686567164179,
"grad_norm": 4.486151113202917,
"learning_rate": 1.9445856924001987e-06,
"loss": 1.2286,
"step": 66
},
{
"epoch": 0.25,
"grad_norm": 3.4213181164419453,
"learning_rate": 1.9426082581210507e-06,
"loss": 1.1066,
"step": 67
},
{
"epoch": 0.2537313432835821,
"grad_norm": 3.2763985271498646,
"learning_rate": 1.9405971991583107e-06,
"loss": 1.1251,
"step": 68
},
{
"epoch": 0.2574626865671642,
"grad_norm": 3.2345913669334014,
"learning_rate": 1.9385525872503914e-06,
"loss": 1.1556,
"step": 69
},
{
"epoch": 0.26119402985074625,
"grad_norm": 3.106534312573296,
"learning_rate": 1.9364744953326073e-06,
"loss": 1.0577,
"step": 70
},
{
"epoch": 0.26492537313432835,
"grad_norm": 2.9408023039448272,
"learning_rate": 1.9343629975345684e-06,
"loss": 0.9973,
"step": 71
},
{
"epoch": 0.26865671641791045,
"grad_norm": 4.559495537184346,
"learning_rate": 1.9322181691775386e-06,
"loss": 1.2465,
"step": 72
},
{
"epoch": 0.27238805970149255,
"grad_norm": 4.24529782366461,
"learning_rate": 1.9300400867717483e-06,
"loss": 1.0913,
"step": 73
},
{
"epoch": 0.27611940298507465,
"grad_norm": 5.1030664391900595,
"learning_rate": 1.9278288280136647e-06,
"loss": 1.1773,
"step": 74
},
{
"epoch": 0.2798507462686567,
"grad_norm": 2.266554727723586,
"learning_rate": 1.9255844717832204e-06,
"loss": 1.4612,
"step": 75
},
{
"epoch": 0.2835820895522388,
"grad_norm": 3.4335407395713795,
"learning_rate": 1.9233070981410005e-06,
"loss": 0.9848,
"step": 76
},
{
"epoch": 0.2873134328358209,
"grad_norm": 3.0840869155577066,
"learning_rate": 1.9209967883253844e-06,
"loss": 1.1614,
"step": 77
},
{
"epoch": 0.291044776119403,
"grad_norm": 3.9174952308981648,
"learning_rate": 1.9186536247496515e-06,
"loss": 0.993,
"step": 78
},
{
"epoch": 0.2947761194029851,
"grad_norm": 6.675359765928192,
"learning_rate": 1.916277690999037e-06,
"loss": 1.1993,
"step": 79
},
{
"epoch": 0.29850746268656714,
"grad_norm": 3.3312539449544336,
"learning_rate": 1.9138690718277538e-06,
"loss": 1.1122,
"step": 80
},
{
"epoch": 0.30223880597014924,
"grad_norm": 3.848892934397973,
"learning_rate": 1.9114278531559673e-06,
"loss": 1.2558,
"step": 81
},
{
"epoch": 0.30597014925373134,
"grad_norm": 4.176454718168841,
"learning_rate": 1.908954122066731e-06,
"loss": 1.1981,
"step": 82
},
{
"epoch": 0.30970149253731344,
"grad_norm": 2.73071312325315,
"learning_rate": 1.9064479668028799e-06,
"loss": 1.2421,
"step": 83
},
{
"epoch": 0.31343283582089554,
"grad_norm": 3.195988999340381,
"learning_rate": 1.903909476763883e-06,
"loss": 1.1304,
"step": 84
},
{
"epoch": 0.31716417910447764,
"grad_norm": 4.218603688955521,
"learning_rate": 1.9013387425026548e-06,
"loss": 1.1864,
"step": 85
},
{
"epoch": 0.3208955223880597,
"grad_norm": 3.6984773161373514,
"learning_rate": 1.8987358557223229e-06,
"loss": 1.1586,
"step": 86
},
{
"epoch": 0.3246268656716418,
"grad_norm": 3.92008728241735,
"learning_rate": 1.8961009092729597e-06,
"loss": 1.4377,
"step": 87
},
{
"epoch": 0.3283582089552239,
"grad_norm": 3.891396254447255,
"learning_rate": 1.8934339971482673e-06,
"loss": 0.8258,
"step": 88
},
{
"epoch": 0.332089552238806,
"grad_norm": 3.0403561652586593,
"learning_rate": 1.8907352144822281e-06,
"loss": 1.1502,
"step": 89
},
{
"epoch": 0.3358208955223881,
"grad_norm": 4.226515814729617,
"learning_rate": 1.8880046575457071e-06,
"loss": 1.3202,
"step": 90
},
{
"epoch": 0.33955223880597013,
"grad_norm": 3.1371524388050562,
"learning_rate": 1.8852424237430213e-06,
"loss": 1.0916,
"step": 91
},
{
"epoch": 0.34328358208955223,
"grad_norm": 5.798409840680744,
"learning_rate": 1.882448611608463e-06,
"loss": 1.0295,
"step": 92
},
{
"epoch": 0.34701492537313433,
"grad_norm": 3.20282503021626,
"learning_rate": 1.8796233208027847e-06,
"loss": 1.0562,
"step": 93
},
{
"epoch": 0.35074626865671643,
"grad_norm": 4.8622829880049165,
"learning_rate": 1.8767666521096466e-06,
"loss": 1.3517,
"step": 94
},
{
"epoch": 0.35447761194029853,
"grad_norm": 3.236061343689611,
"learning_rate": 1.8738787074320176e-06,
"loss": 1.3072,
"step": 95
},
{
"epoch": 0.3582089552238806,
"grad_norm": 2.8290205291903834,
"learning_rate": 1.8709595897885436e-06,
"loss": 1.0689,
"step": 96
},
{
"epoch": 0.3619402985074627,
"grad_norm": 9.248683748830997,
"learning_rate": 1.8680094033098714e-06,
"loss": 1.1408,
"step": 97
},
{
"epoch": 0.3656716417910448,
"grad_norm": 3.351494222719439,
"learning_rate": 1.865028253234933e-06,
"loss": 1.0601,
"step": 98
},
{
"epoch": 0.3694029850746269,
"grad_norm": 2.9558835746922982,
"learning_rate": 1.8620162459071933e-06,
"loss": 1.469,
"step": 99
},
{
"epoch": 0.373134328358209,
"grad_norm": 4.188173168279777,
"learning_rate": 1.8589734887708555e-06,
"loss": 1.0811,
"step": 100
},
{
"epoch": 0.376865671641791,
"grad_norm": 4.149900932370019,
"learning_rate": 1.855900090367029e-06,
"loss": 1.2524,
"step": 101
},
{
"epoch": 0.3805970149253731,
"grad_norm": 3.4354588132333954,
"learning_rate": 1.852796160329857e-06,
"loss": 1.2036,
"step": 102
},
{
"epoch": 0.3843283582089552,
"grad_norm": 7.008352254151579,
"learning_rate": 1.8496618093826062e-06,
"loss": 1.3326,
"step": 103
},
{
"epoch": 0.3880597014925373,
"grad_norm": 4.57793539598067,
"learning_rate": 1.8464971493337165e-06,
"loss": 1.1313,
"step": 104
},
{
"epoch": 0.3917910447761194,
"grad_norm": 5.023484484297041,
"learning_rate": 1.843302293072813e-06,
"loss": 1.1537,
"step": 105
},
{
"epoch": 0.39552238805970147,
"grad_norm": 5.417491975498189,
"learning_rate": 1.8400773545666786e-06,
"loss": 1.1948,
"step": 106
},
{
"epoch": 0.39925373134328357,
"grad_norm": 6.758289030774797,
"learning_rate": 1.8368224488551895e-06,
"loss": 1.4521,
"step": 107
},
{
"epoch": 0.40298507462686567,
"grad_norm": 3.113115656591011,
"learning_rate": 1.8335376920472096e-06,
"loss": 1.3848,
"step": 108
},
{
"epoch": 0.40671641791044777,
"grad_norm": 6.762254585306641,
"learning_rate": 1.8302232013164516e-06,
"loss": 1.157,
"step": 109
},
{
"epoch": 0.41044776119402987,
"grad_norm": 4.658736688116275,
"learning_rate": 1.8268790948972938e-06,
"loss": 1.0968,
"step": 110
},
{
"epoch": 0.4141791044776119,
"grad_norm": 4.3724969896587,
"learning_rate": 1.8235054920805651e-06,
"loss": 1.3121,
"step": 111
},
{
"epoch": 0.417910447761194,
"grad_norm": 1.9047640764173237,
"learning_rate": 1.8201025132092886e-06,
"loss": 0.966,
"step": 112
},
{
"epoch": 0.4216417910447761,
"grad_norm": 2.6109096330803645,
"learning_rate": 1.8166702796743888e-06,
"loss": 0.9965,
"step": 113
},
{
"epoch": 0.4253731343283582,
"grad_norm": 2.236001574200741,
"learning_rate": 1.813208913910361e-06,
"loss": 1.2068,
"step": 114
},
{
"epoch": 0.4291044776119403,
"grad_norm": 3.5252250250259904,
"learning_rate": 1.8097185393909047e-06,
"loss": 0.9945,
"step": 115
},
{
"epoch": 0.43283582089552236,
"grad_norm": 2.72651289344666,
"learning_rate": 1.8061992806245183e-06,
"loss": 1.1221,
"step": 116
},
{
"epoch": 0.43656716417910446,
"grad_norm": 2.7580533850806663,
"learning_rate": 1.802651263150058e-06,
"loss": 1.1106,
"step": 117
},
{
"epoch": 0.44029850746268656,
"grad_norm": 3.36730794119517,
"learning_rate": 1.7990746135322592e-06,
"loss": 1.3169,
"step": 118
},
{
"epoch": 0.44402985074626866,
"grad_norm": 3.602491306889906,
"learning_rate": 1.7954694593572225e-06,
"loss": 1.2271,
"step": 119
},
{
"epoch": 0.44776119402985076,
"grad_norm": 2.995624556867228,
"learning_rate": 1.7918359292278611e-06,
"loss": 1.4585,
"step": 120
},
{
"epoch": 0.45149253731343286,
"grad_norm": 2.4713376740401394,
"learning_rate": 1.7881741527593148e-06,
"loss": 1.0635,
"step": 121
},
{
"epoch": 0.4552238805970149,
"grad_norm": 2.9859360943624558,
"learning_rate": 1.7844842605743255e-06,
"loss": 1.1158,
"step": 122
},
{
"epoch": 0.458955223880597,
"grad_norm": 2.236880197413377,
"learning_rate": 1.7807663842985776e-06,
"loss": 1.0568,
"step": 123
},
{
"epoch": 0.4626865671641791,
"grad_norm": 2.812929367683673,
"learning_rate": 1.777020656556003e-06,
"loss": 0.9711,
"step": 124
},
{
"epoch": 0.4664179104477612,
"grad_norm": 2.615520237147118,
"learning_rate": 1.77324721096405e-06,
"loss": 1.2155,
"step": 125
},
{
"epoch": 0.4701492537313433,
"grad_norm": 2.56945954741141,
"learning_rate": 1.7694461821289171e-06,
"loss": 1.2214,
"step": 126
},
{
"epoch": 0.47388059701492535,
"grad_norm": 2.6524624585109255,
"learning_rate": 1.7656177056407504e-06,
"loss": 1.0783,
"step": 127
},
{
"epoch": 0.47761194029850745,
"grad_norm": 3.900106255085836,
"learning_rate": 1.7617619180688084e-06,
"loss": 1.1345,
"step": 128
},
{
"epoch": 0.48134328358208955,
"grad_norm": 3.6445674759996973,
"learning_rate": 1.7578789569565889e-06,
"loss": 1.1407,
"step": 129
},
{
"epoch": 0.48507462686567165,
"grad_norm": 3.2321962413724834,
"learning_rate": 1.7539689608169236e-06,
"loss": 1.2281,
"step": 130
},
{
"epoch": 0.48880597014925375,
"grad_norm": 4.609891513693221,
"learning_rate": 1.7500320691270363e-06,
"loss": 1.2394,
"step": 131
},
{
"epoch": 0.4925373134328358,
"grad_norm": 6.929277355854441,
"learning_rate": 1.7460684223235678e-06,
"loss": 1.233,
"step": 132
},
{
"epoch": 0.4962686567164179,
"grad_norm": 3.231892093569866,
"learning_rate": 1.7420781617975663e-06,
"loss": 0.9962,
"step": 133
},
{
"epoch": 0.5,
"grad_norm": 2.5090780090395115,
"learning_rate": 1.738061429889444e-06,
"loss": 0.9036,
"step": 134
},
{
"epoch": 0.503731343283582,
"grad_norm": 4.328856707412604,
"learning_rate": 1.734018369883898e-06,
"loss": 1.1895,
"step": 135
},
{
"epoch": 0.5074626865671642,
"grad_norm": 2.5691655391373875,
"learning_rate": 1.7299491260048019e-06,
"loss": 1.326,
"step": 136
},
{
"epoch": 0.5111940298507462,
"grad_norm": 1.732025147621955,
"learning_rate": 1.7258538434100576e-06,
"loss": 1.2479,
"step": 137
},
{
"epoch": 0.5149253731343284,
"grad_norm": 2.20121934912806,
"learning_rate": 1.7217326681864206e-06,
"loss": 1.0356,
"step": 138
},
{
"epoch": 0.5186567164179104,
"grad_norm": 1.9985655432331606,
"learning_rate": 1.717585747344286e-06,
"loss": 1.1547,
"step": 139
},
{
"epoch": 0.5223880597014925,
"grad_norm": 1.8929729658584291,
"learning_rate": 1.7134132288124464e-06,
"loss": 1.1972,
"step": 140
},
{
"epoch": 0.5261194029850746,
"grad_norm": 2.5705526162651284,
"learning_rate": 1.7092152614328136e-06,
"loss": 0.9647,
"step": 141
},
{
"epoch": 0.5298507462686567,
"grad_norm": 2.3300073188215134,
"learning_rate": 1.7049919949551099e-06,
"loss": 1.4177,
"step": 142
},
{
"epoch": 0.5335820895522388,
"grad_norm": 2.919579845785974,
"learning_rate": 1.7007435800315261e-06,
"loss": 1.0245,
"step": 143
},
{
"epoch": 0.5373134328358209,
"grad_norm": 2.58628020939173,
"learning_rate": 1.6964701682113474e-06,
"loss": 1.1438,
"step": 144
},
{
"epoch": 0.5410447761194029,
"grad_norm": 2.1810582155175906,
"learning_rate": 1.6921719119355466e-06,
"loss": 1.1709,
"step": 145
},
{
"epoch": 0.5447761194029851,
"grad_norm": 2.0256539029853036,
"learning_rate": 1.687848964531348e-06,
"loss": 1.2567,
"step": 146
},
{
"epoch": 0.5485074626865671,
"grad_norm": 2.6789651782329003,
"learning_rate": 1.6835014802067556e-06,
"loss": 1.2105,
"step": 147
},
{
"epoch": 0.5522388059701493,
"grad_norm": 2.2475712729751813,
"learning_rate": 1.6791296140450543e-06,
"loss": 1.0036,
"step": 148
},
{
"epoch": 0.5559701492537313,
"grad_norm": 3.081758388528468,
"learning_rate": 1.6747335219992774e-06,
"loss": 1.229,
"step": 149
},
{
"epoch": 0.5597014925373134,
"grad_norm": 3.4435580918281903,
"learning_rate": 1.6703133608866414e-06,
"loss": 1.2375,
"step": 150
},
{
"epoch": 0.5634328358208955,
"grad_norm": 3.6488645320162263,
"learning_rate": 1.6658692883829546e-06,
"loss": 1.2528,
"step": 151
},
{
"epoch": 0.5671641791044776,
"grad_norm": 2.6147378121358535,
"learning_rate": 1.6614014630169915e-06,
"loss": 1.0683,
"step": 152
},
{
"epoch": 0.5708955223880597,
"grad_norm": 3.4412924263138502,
"learning_rate": 1.6569100441648372e-06,
"loss": 1.2073,
"step": 153
},
{
"epoch": 0.5746268656716418,
"grad_norm": 3.8345977623754117,
"learning_rate": 1.6523951920442032e-06,
"loss": 1.1582,
"step": 154
},
{
"epoch": 0.5783582089552238,
"grad_norm": 3.049354065878489,
"learning_rate": 1.6478570677087116e-06,
"loss": 1.26,
"step": 155
},
{
"epoch": 0.582089552238806,
"grad_norm": 2.667157342873667,
"learning_rate": 1.6432958330421497e-06,
"loss": 1.1972,
"step": 156
},
{
"epoch": 0.585820895522388,
"grad_norm": 2.3988481838806517,
"learning_rate": 1.6387116507526955e-06,
"loss": 1.0296,
"step": 157
},
{
"epoch": 0.5895522388059702,
"grad_norm": 3.245331214881116,
"learning_rate": 1.6341046843671142e-06,
"loss": 1.0837,
"step": 158
},
{
"epoch": 0.5932835820895522,
"grad_norm": 2.740295402410237,
"learning_rate": 1.629475098224924e-06,
"loss": 1.0756,
"step": 159
},
{
"epoch": 0.5970149253731343,
"grad_norm": 3.8088940588573625,
"learning_rate": 1.6248230574725338e-06,
"loss": 1.2506,
"step": 160
},
{
"epoch": 0.6007462686567164,
"grad_norm": 5.166361637828825,
"learning_rate": 1.6201487280573533e-06,
"loss": 0.9793,
"step": 161
},
{
"epoch": 0.6044776119402985,
"grad_norm": 3.2415888531812485,
"learning_rate": 1.6154522767218723e-06,
"loss": 1.3401,
"step": 162
},
{
"epoch": 0.6082089552238806,
"grad_norm": 3.305335197143126,
"learning_rate": 1.6107338709977118e-06,
"loss": 1.4258,
"step": 163
},
{
"epoch": 0.6119402985074627,
"grad_norm": 3.074545865145304,
"learning_rate": 1.6059936791996497e-06,
"loss": 1.192,
"step": 164
},
{
"epoch": 0.6156716417910447,
"grad_norm": 2.40059366672424,
"learning_rate": 1.601231870419616e-06,
"loss": 0.984,
"step": 165
},
{
"epoch": 0.6194029850746269,
"grad_norm": 2.9844452713407197,
"learning_rate": 1.596448614520661e-06,
"loss": 1.1051,
"step": 166
},
{
"epoch": 0.6231343283582089,
"grad_norm": 3.241493731745323,
"learning_rate": 1.5916440821308947e-06,
"loss": 1.1032,
"step": 167
},
{
"epoch": 0.6268656716417911,
"grad_norm": 4.002012008083462,
"learning_rate": 1.586818444637402e-06,
"loss": 1.1281,
"step": 168
},
{
"epoch": 0.6305970149253731,
"grad_norm": 2.891081597812952,
"learning_rate": 1.5819718741801282e-06,
"loss": 1.0984,
"step": 169
},
{
"epoch": 0.6343283582089553,
"grad_norm": 2.510097661559307,
"learning_rate": 1.577104543645738e-06,
"loss": 0.9818,
"step": 170
},
{
"epoch": 0.6380597014925373,
"grad_norm": 3.9519151526817784,
"learning_rate": 1.5722166266614494e-06,
"loss": 1.403,
"step": 171
},
{
"epoch": 0.6417910447761194,
"grad_norm": 2.889629899144798,
"learning_rate": 1.5673082975888386e-06,
"loss": 1.4251,
"step": 172
},
{
"epoch": 0.6455223880597015,
"grad_norm": 3.2843979337315203,
"learning_rate": 1.5623797315176217e-06,
"loss": 1.2102,
"step": 173
},
{
"epoch": 0.6492537313432836,
"grad_norm": 3.851544142794571,
"learning_rate": 1.5574311042594077e-06,
"loss": 1.3174,
"step": 174
},
{
"epoch": 0.6529850746268657,
"grad_norm": 3.0632504419966224,
"learning_rate": 1.552462592341428e-06,
"loss": 1.2578,
"step": 175
},
{
"epoch": 0.6567164179104478,
"grad_norm": 2.9143363462552414,
"learning_rate": 1.547474373000238e-06,
"loss": 1.1117,
"step": 176
},
{
"epoch": 0.6604477611940298,
"grad_norm": 3.33708665616015,
"learning_rate": 1.5424666241753963e-06,
"loss": 1.3296,
"step": 177
},
{
"epoch": 0.664179104477612,
"grad_norm": 2.5174595420642767,
"learning_rate": 1.5374395245031157e-06,
"loss": 1.2501,
"step": 178
},
{
"epoch": 0.667910447761194,
"grad_norm": 4.722876645619478,
"learning_rate": 1.5323932533098924e-06,
"loss": 0.8606,
"step": 179
},
{
"epoch": 0.6716417910447762,
"grad_norm": 3.23675727446907,
"learning_rate": 1.527327990606108e-06,
"loss": 1.1848,
"step": 180
},
{
"epoch": 0.6753731343283582,
"grad_norm": 3.2255476770575906,
"learning_rate": 1.522243917079608e-06,
"loss": 1.1501,
"step": 181
},
{
"epoch": 0.6791044776119403,
"grad_norm": 2.660659180388112,
"learning_rate": 1.5171412140892574e-06,
"loss": 1.1792,
"step": 182
},
{
"epoch": 0.6828358208955224,
"grad_norm": 2.5742735359754656,
"learning_rate": 1.512020063658471e-06,
"loss": 1.0524,
"step": 183
},
{
"epoch": 0.6865671641791045,
"grad_norm": 2.7222921596819805,
"learning_rate": 1.5068806484687188e-06,
"loss": 0.9408,
"step": 184
},
{
"epoch": 0.6902985074626866,
"grad_norm": 2.854241224431344,
"learning_rate": 1.5017231518530115e-06,
"loss": 1.1946,
"step": 185
},
{
"epoch": 0.6940298507462687,
"grad_norm": 2.829758100829405,
"learning_rate": 1.4965477577893596e-06,
"loss": 1.0996,
"step": 186
},
{
"epoch": 0.6977611940298507,
"grad_norm": 2.7341811827310907,
"learning_rate": 1.4913546508942104e-06,
"loss": 1.3112,
"step": 187
},
{
"epoch": 0.7014925373134329,
"grad_norm": 2.727595416423421,
"learning_rate": 1.486144016415862e-06,
"loss": 0.8641,
"step": 188
},
{
"epoch": 0.7052238805970149,
"grad_norm": 2.6779321037785957,
"learning_rate": 1.4809160402278572e-06,
"loss": 1.0673,
"step": 189
},
{
"epoch": 0.7089552238805971,
"grad_norm": 2.105636468467112,
"learning_rate": 1.4756709088223507e-06,
"loss": 1.0804,
"step": 190
},
{
"epoch": 0.7126865671641791,
"grad_norm": 2.308917007984876,
"learning_rate": 1.470408809303457e-06,
"loss": 1.0657,
"step": 191
},
{
"epoch": 0.7164179104477612,
"grad_norm": 2.272233759263439,
"learning_rate": 1.4651299293805772e-06,
"loss": 0.97,
"step": 192
},
{
"epoch": 0.7201492537313433,
"grad_norm": 2.38194076941112,
"learning_rate": 1.459834457361702e-06,
"loss": 1.1996,
"step": 193
},
{
"epoch": 0.7238805970149254,
"grad_norm": 2.609236963602244,
"learning_rate": 1.4545225821466949e-06,
"loss": 1.4137,
"step": 194
},
{
"epoch": 0.7276119402985075,
"grad_norm": 2.1583872582681303,
"learning_rate": 1.449194493220553e-06,
"loss": 1.21,
"step": 195
},
{
"epoch": 0.7313432835820896,
"grad_norm": 2.0168668065761004,
"learning_rate": 1.443850380646649e-06,
"loss": 1.2648,
"step": 196
},
{
"epoch": 0.7350746268656716,
"grad_norm": 2.8244668704260434,
"learning_rate": 1.4384904350599496e-06,
"loss": 1.158,
"step": 197
},
{
"epoch": 0.7388059701492538,
"grad_norm": 2.154427501128158,
"learning_rate": 1.433114847660217e-06,
"loss": 1.1111,
"step": 198
},
{
"epoch": 0.7425373134328358,
"grad_norm": 1.905058417754889,
"learning_rate": 1.427723810205187e-06,
"loss": 0.969,
"step": 199
},
{
"epoch": 0.746268656716418,
"grad_norm": 2.739735762190122,
"learning_rate": 1.4223175150037295e-06,
"loss": 1.2142,
"step": 200
},
{
"epoch": 0.75,
"grad_norm": 3.4650290962226777,
"learning_rate": 1.4168961549089872e-06,
"loss": 1.1373,
"step": 201
},
{
"epoch": 0.753731343283582,
"grad_norm": 2.5869478423809786,
"learning_rate": 1.4114599233114986e-06,
"loss": 1.3506,
"step": 202
},
{
"epoch": 0.7574626865671642,
"grad_norm": 3.1980963820842483,
"learning_rate": 1.4060090141322966e-06,
"loss": 1.0384,
"step": 203
},
{
"epoch": 0.7611940298507462,
"grad_norm": 2.5362305958432443,
"learning_rate": 1.4005436218159925e-06,
"loss": 1.1983,
"step": 204
},
{
"epoch": 0.7649253731343284,
"grad_norm": 1.7669955812420282,
"learning_rate": 1.3950639413238393e-06,
"loss": 1.1922,
"step": 205
},
{
"epoch": 0.7686567164179104,
"grad_norm": 3.236818206550707,
"learning_rate": 1.3895701681267782e-06,
"loss": 1.1532,
"step": 206
},
{
"epoch": 0.7723880597014925,
"grad_norm": 3.1410703998345917,
"learning_rate": 1.384062498198464e-06,
"loss": 1.2707,
"step": 207
},
{
"epoch": 0.7761194029850746,
"grad_norm": 2.947726795909021,
"learning_rate": 1.3785411280082746e-06,
"loss": 1.1552,
"step": 208
},
{
"epoch": 0.7798507462686567,
"grad_norm": 4.158405889593859,
"learning_rate": 1.373006254514304e-06,
"loss": 1.1323,
"step": 209
},
{
"epoch": 0.7835820895522388,
"grad_norm": 3.6596410080845483,
"learning_rate": 1.3674580751563357e-06,
"loss": 1.1021,
"step": 210
},
{
"epoch": 0.7873134328358209,
"grad_norm": 3.4837568397902063,
"learning_rate": 1.361896787848798e-06,
"loss": 1.1507,
"step": 211
},
{
"epoch": 0.7910447761194029,
"grad_norm": 5.190434900700764,
"learning_rate": 1.3563225909737074e-06,
"loss": 1.1307,
"step": 212
},
{
"epoch": 0.7947761194029851,
"grad_norm": 3.193649918972427,
"learning_rate": 1.3507356833735885e-06,
"loss": 1.1674,
"step": 213
},
{
"epoch": 0.7985074626865671,
"grad_norm": 3.64309739990448,
"learning_rate": 1.3451362643443831e-06,
"loss": 1.1026,
"step": 214
},
{
"epoch": 0.8022388059701493,
"grad_norm": 4.480821519285648,
"learning_rate": 1.3395245336283396e-06,
"loss": 1.1305,
"step": 215
},
{
"epoch": 0.8059701492537313,
"grad_norm": 2.485764813025922,
"learning_rate": 1.333900691406889e-06,
"loss": 1.0909,
"step": 216
},
{
"epoch": 0.8097014925373134,
"grad_norm": 2.8276534151044417,
"learning_rate": 1.3282649382935028e-06,
"loss": 1.2906,
"step": 217
},
{
"epoch": 0.8134328358208955,
"grad_norm": 2.661022282944918,
"learning_rate": 1.322617475326538e-06,
"loss": 1.0923,
"step": 218
},
{
"epoch": 0.8171641791044776,
"grad_norm": 2.6551254805947053,
"learning_rate": 1.316958503962065e-06,
"loss": 1.1648,
"step": 219
},
{
"epoch": 0.8208955223880597,
"grad_norm": 2.3353396983390486,
"learning_rate": 1.3112882260666805e-06,
"loss": 1.2479,
"step": 220
},
{
"epoch": 0.8246268656716418,
"grad_norm": 1.8853847357875915,
"learning_rate": 1.3056068439103082e-06,
"loss": 0.9367,
"step": 221
},
{
"epoch": 0.8283582089552238,
"grad_norm": 1.7789270126386558,
"learning_rate": 1.299914560158982e-06,
"loss": 0.9866,
"step": 222
},
{
"epoch": 0.832089552238806,
"grad_norm": 4.437767240695352,
"learning_rate": 1.2942115778676175e-06,
"loss": 1.0143,
"step": 223
},
{
"epoch": 0.835820895522388,
"grad_norm": 2.643730633752304,
"learning_rate": 1.2884981004727675e-06,
"loss": 1.1737,
"step": 224
},
{
"epoch": 0.8395522388059702,
"grad_norm": 4.113252275049106,
"learning_rate": 1.2827743317853666e-06,
"loss": 1.278,
"step": 225
},
{
"epoch": 0.8432835820895522,
"grad_norm": 4.473452632975873,
"learning_rate": 1.2770404759834592e-06,
"loss": 1.2337,
"step": 226
},
{
"epoch": 0.8470149253731343,
"grad_norm": 3.3121627389468227,
"learning_rate": 1.2712967376049176e-06,
"loss": 0.9808,
"step": 227
},
{
"epoch": 0.8507462686567164,
"grad_norm": 2.765455947896225,
"learning_rate": 1.2655433215401437e-06,
"loss": 0.809,
"step": 228
},
{
"epoch": 0.8544776119402985,
"grad_norm": 5.806520585625066,
"learning_rate": 1.2597804330247629e-06,
"loss": 1.3475,
"step": 229
},
{
"epoch": 0.8582089552238806,
"grad_norm": 4.3730223037366365,
"learning_rate": 1.2540082776323006e-06,
"loss": 1.0836,
"step": 230
},
{
"epoch": 0.8619402985074627,
"grad_norm": 2.5075803170353987,
"learning_rate": 1.2482270612668507e-06,
"loss": 1.1071,
"step": 231
},
{
"epoch": 0.8656716417910447,
"grad_norm": 3.845367887472252,
"learning_rate": 1.242436990155728e-06,
"loss": 1.249,
"step": 232
},
{
"epoch": 0.8694029850746269,
"grad_norm": 3.2664015113912237,
"learning_rate": 1.2366382708421154e-06,
"loss": 1.1988,
"step": 233
},
{
"epoch": 0.8731343283582089,
"grad_norm": 3.9034686201589586,
"learning_rate": 1.2308311101776932e-06,
"loss": 1.2718,
"step": 234
},
{
"epoch": 0.8768656716417911,
"grad_norm": 2.1523180105846844,
"learning_rate": 1.2250157153152609e-06,
"loss": 1.1845,
"step": 235
},
{
"epoch": 0.8805970149253731,
"grad_norm": 2.8559179892421116,
"learning_rate": 1.2191922937013488e-06,
"loss": 1.2277,
"step": 236
},
{
"epoch": 0.8843283582089553,
"grad_norm": 2.4964069518984697,
"learning_rate": 1.2133610530688167e-06,
"loss": 1.1304,
"step": 237
},
{
"epoch": 0.8880597014925373,
"grad_norm": 1.6510558048415136,
"learning_rate": 1.2075222014294447e-06,
"loss": 1.0716,
"step": 238
},
{
"epoch": 0.8917910447761194,
"grad_norm": 4.138846396996276,
"learning_rate": 1.2016759470665109e-06,
"loss": 1.1715,
"step": 239
},
{
"epoch": 0.8955223880597015,
"grad_norm": 2.591000009602829,
"learning_rate": 1.1958224985273645e-06,
"loss": 1.2082,
"step": 240
},
{
"epoch": 0.8992537313432836,
"grad_norm": 1.6656445235525534,
"learning_rate": 1.1899620646159853e-06,
"loss": 1.057,
"step": 241
},
{
"epoch": 0.9029850746268657,
"grad_norm": 3.4344302308874486,
"learning_rate": 1.1840948543855334e-06,
"loss": 0.9381,
"step": 242
},
{
"epoch": 0.9067164179104478,
"grad_norm": 2.5448689987449384,
"learning_rate": 1.1782210771308947e-06,
"loss": 1.1778,
"step": 243
},
{
"epoch": 0.9104477611940298,
"grad_norm": 2.2000538592782433,
"learning_rate": 1.1723409423812134e-06,
"loss": 1.1269,
"step": 244
},
{
"epoch": 0.914179104477612,
"grad_norm": 1.6886830906655088,
"learning_rate": 1.1664546598924184e-06,
"loss": 1.1615,
"step": 245
},
{
"epoch": 0.917910447761194,
"grad_norm": 2.2494221352588886,
"learning_rate": 1.1605624396397398e-06,
"loss": 1.4029,
"step": 246
},
{
"epoch": 0.9216417910447762,
"grad_norm": 2.012712883705275,
"learning_rate": 1.1546644918102196e-06,
"loss": 1.1799,
"step": 247
},
{
"epoch": 0.9253731343283582,
"grad_norm": 2.3518817671490586,
"learning_rate": 1.1487610267952142e-06,
"loss": 1.1566,
"step": 248
},
{
"epoch": 0.9291044776119403,
"grad_norm": 2.0646756101710593,
"learning_rate": 1.1428522551828882e-06,
"loss": 1.2883,
"step": 249
},
{
"epoch": 0.9328358208955224,
"grad_norm": 1.812081401132651,
"learning_rate": 1.1369383877507034e-06,
"loss": 1.2653,
"step": 250
},
{
"epoch": 0.9365671641791045,
"grad_norm": 2.242364078092567,
"learning_rate": 1.131019635457899e-06,
"loss": 1.1829,
"step": 251
},
{
"epoch": 0.9402985074626866,
"grad_norm": 2.0182289267611258,
"learning_rate": 1.1250962094379668e-06,
"loss": 0.9778,
"step": 252
},
{
"epoch": 0.9440298507462687,
"grad_norm": 2.4797725291235593,
"learning_rate": 1.1191683209911201e-06,
"loss": 1.0714,
"step": 253
},
{
"epoch": 0.9477611940298507,
"grad_norm": 2.519766746322205,
"learning_rate": 1.1132361815767552e-06,
"loss": 1.2406,
"step": 254
},
{
"epoch": 0.9514925373134329,
"grad_norm": 2.063185346641498,
"learning_rate": 1.1073000028059095e-06,
"loss": 0.987,
"step": 255
},
{
"epoch": 0.9552238805970149,
"grad_norm": 1.5333052526002764,
"learning_rate": 1.1013599964337106e-06,
"loss": 0.8951,
"step": 256
},
{
"epoch": 0.9589552238805971,
"grad_norm": 4.771307269529906,
"learning_rate": 1.095416374351826e-06,
"loss": 1.2666,
"step": 257
},
{
"epoch": 0.9626865671641791,
"grad_norm": 2.717541175438304,
"learning_rate": 1.0894693485809014e-06,
"loss": 1.1109,
"step": 258
},
{
"epoch": 0.9664179104477612,
"grad_norm": 2.756698383274168,
"learning_rate": 1.0835191312629992e-06,
"loss": 1.129,
"step": 259
},
{
"epoch": 0.9701492537313433,
"grad_norm": 1.7033771070854546,
"learning_rate": 1.0775659346540303e-06,
"loss": 0.9603,
"step": 260
},
{
"epoch": 0.9738805970149254,
"grad_norm": 2.5967915673434034,
"learning_rate": 1.0716099711161832e-06,
"loss": 1.1943,
"step": 261
},
{
"epoch": 0.9776119402985075,
"grad_norm": 2.145466598370863,
"learning_rate": 1.0656514531103483e-06,
"loss": 0.8841,
"step": 262
},
{
"epoch": 0.9813432835820896,
"grad_norm": 2.1800126434020477,
"learning_rate": 1.0596905931885373e-06,
"loss": 0.9661,
"step": 263
},
{
"epoch": 0.9850746268656716,
"grad_norm": 2.8261681835210544,
"learning_rate": 1.0537276039863047e-06,
"loss": 1.1867,
"step": 264
},
{
"epoch": 0.9888059701492538,
"grad_norm": 3.2978247537112586,
"learning_rate": 1.04776269821516e-06,
"loss": 1.2103,
"step": 265
},
{
"epoch": 0.9925373134328358,
"grad_norm": 2.8052176780437064,
"learning_rate": 1.0417960886549798e-06,
"loss": 1.3141,
"step": 266
},
{
"epoch": 0.996268656716418,
"grad_norm": 2.5933472539580635,
"learning_rate": 1.035827988146418e-06,
"loss": 1.078,
"step": 267
},
{
"epoch": 1.0,
"grad_norm": 2.8520830177001395,
"learning_rate": 1.0298586095833151e-06,
"loss": 1.3273,
"step": 268
}
],
"logging_steps": 1,
"max_steps": 536,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 268,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 50952048476160.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
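
# Minimal sketch (not part of the checkpoint itself): one plausible way to load this
# trainer_state.json and summarize the logged training curve. Assumes the file sits in
# the current directory under its usual name; only the Python standard library is used.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]          # one dict per logged optimizer step
steps = [h["step"] for h in history]
losses = [h["loss"] for h in history]
lrs = [h["learning_rate"] for h in history]

print(f"global_step={state['global_step']} of max_steps={state['max_steps']}")
print(f"epoch={state['epoch']} (num_train_epochs={state['num_train_epochs']})")
print(f"first/last logged loss: {losses[0]:.4f} -> {losses[-1]:.4f}")
print(f"peak learning rate: {max(lrs):.2e}")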