{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.96879875195008,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0998439937597504,
"grad_norm": 0.4140388733942774,
"learning_rate": 0.0,
"loss": 1.9149,
"step": 1
},
{
"epoch": 0.1996879875195008,
"grad_norm": 4.125320422125423,
"learning_rate": 0.0001,
"loss": 2.0842,
"step": 2
},
{
"epoch": 0.2995319812792512,
"grad_norm": 4.114561571796027,
"learning_rate": 0.0001,
"loss": 2.0845,
"step": 3
},
{
"epoch": 0.3993759750390016,
"grad_norm": 7.814135082802521,
"learning_rate": 9.94949494949495e-05,
"loss": 2.2529,
"step": 4
},
{
"epoch": 0.49921996879875197,
"grad_norm": 4.959578007928057,
"learning_rate": 9.8989898989899e-05,
"loss": 2.1802,
"step": 5
},
{
"epoch": 0.5990639625585024,
"grad_norm": 4.020980884960991,
"learning_rate": 9.848484848484849e-05,
"loss": 2.0996,
"step": 6
},
{
"epoch": 0.6989079563182528,
"grad_norm": 2.913148956691379,
"learning_rate": 9.797979797979798e-05,
"loss": 2.0739,
"step": 7
},
{
"epoch": 0.7987519500780031,
"grad_norm": 1.5904378694728827,
"learning_rate": 9.747474747474747e-05,
"loss": 2.0438,
"step": 8
},
{
"epoch": 0.8985959438377535,
"grad_norm": 1.1120971271683253,
"learning_rate": 9.696969696969698e-05,
"loss": 2.023,
"step": 9
},
{
"epoch": 0.9984399375975039,
"grad_norm": 1.4857926676190432,
"learning_rate": 9.646464646464647e-05,
"loss": 2.0256,
"step": 10
},
{
"epoch": 1.0982839313572543,
"grad_norm": 1.7514837650567026,
"learning_rate": 9.595959595959596e-05,
"loss": 2.0297,
"step": 11
},
{
"epoch": 1.1981279251170047,
"grad_norm": 1.3684063083659384,
"learning_rate": 9.545454545454546e-05,
"loss": 1.9984,
"step": 12
},
{
"epoch": 1.2979719188767551,
"grad_norm": 0.8391319984906789,
"learning_rate": 9.494949494949495e-05,
"loss": 1.9938,
"step": 13
},
{
"epoch": 1.3978159126365055,
"grad_norm": 0.7680152250335479,
"learning_rate": 9.444444444444444e-05,
"loss": 1.9984,
"step": 14
},
{
"epoch": 1.497659906396256,
"grad_norm": 1.1427900590537006,
"learning_rate": 9.393939393939395e-05,
"loss": 2.0059,
"step": 15
},
{
"epoch": 1.5975039001560063,
"grad_norm": 1.2852588832884364,
"learning_rate": 9.343434343434344e-05,
"loss": 2.0057,
"step": 16
},
{
"epoch": 1.6973478939157567,
"grad_norm": 0.8509981577726656,
"learning_rate": 9.292929292929293e-05,
"loss": 1.97,
"step": 17
},
{
"epoch": 1.797191887675507,
"grad_norm": 0.4374249257660765,
"learning_rate": 9.242424242424242e-05,
"loss": 1.9852,
"step": 18
},
{
"epoch": 1.8970358814352575,
"grad_norm": 1.006945747433108,
"learning_rate": 9.191919191919192e-05,
"loss": 1.9736,
"step": 19
},
{
"epoch": 1.9968798751950079,
"grad_norm": 1.1714326150671521,
"learning_rate": 9.141414141414141e-05,
"loss": 1.9866,
"step": 20
},
{
"epoch": 2.0967238689547583,
"grad_norm": 0.6697915843016325,
"learning_rate": 9.090909090909092e-05,
"loss": 1.9669,
"step": 21
},
{
"epoch": 2.1965678627145087,
"grad_norm": 0.43542954442572934,
"learning_rate": 9.040404040404041e-05,
"loss": 1.9596,
"step": 22
},
{
"epoch": 2.296411856474259,
"grad_norm": 0.8895989581186896,
"learning_rate": 8.98989898989899e-05,
"loss": 1.9777,
"step": 23
},
{
"epoch": 2.3962558502340094,
"grad_norm": 0.748473401890919,
"learning_rate": 8.93939393939394e-05,
"loss": 1.9828,
"step": 24
},
{
"epoch": 2.49609984399376,
"grad_norm": 0.4762840239068188,
"learning_rate": 8.888888888888889e-05,
"loss": 1.9863,
"step": 25
},
{
"epoch": 2.5959438377535102,
"grad_norm": 0.4634914120924797,
"learning_rate": 8.83838383838384e-05,
"loss": 1.9728,
"step": 26
},
{
"epoch": 2.6957878315132606,
"grad_norm": 0.576721007312459,
"learning_rate": 8.787878787878789e-05,
"loss": 1.9813,
"step": 27
},
{
"epoch": 2.795631825273011,
"grad_norm": 0.4717088615288276,
"learning_rate": 8.737373737373738e-05,
"loss": 1.9709,
"step": 28
},
{
"epoch": 2.8954758190327614,
"grad_norm": 0.5243076095653101,
"learning_rate": 8.686868686868688e-05,
"loss": 1.9889,
"step": 29
},
{
"epoch": 2.995319812792512,
"grad_norm": 0.35563844256479116,
"learning_rate": 8.636363636363637e-05,
"loss": 1.969,
"step": 30
},
{
"epoch": 3.095163806552262,
"grad_norm": 0.5040313506272054,
"learning_rate": 8.585858585858586e-05,
"loss": 1.9701,
"step": 31
},
{
"epoch": 3.1950078003120126,
"grad_norm": 0.5293887294443628,
"learning_rate": 8.535353535353535e-05,
"loss": 1.9774,
"step": 32
},
{
"epoch": 3.294851794071763,
"grad_norm": 0.33336016676106733,
"learning_rate": 8.484848484848486e-05,
"loss": 1.9702,
"step": 33
},
{
"epoch": 3.3946957878315134,
"grad_norm": 0.5156182664373749,
"learning_rate": 8.434343434343435e-05,
"loss": 1.9552,
"step": 34
},
{
"epoch": 3.4945397815912638,
"grad_norm": 0.410792592829029,
"learning_rate": 8.383838383838384e-05,
"loss": 1.9642,
"step": 35
},
{
"epoch": 3.594383775351014,
"grad_norm": 0.40267682408922495,
"learning_rate": 8.333333333333334e-05,
"loss": 1.9688,
"step": 36
},
{
"epoch": 3.6942277691107646,
"grad_norm": 0.3869359148412346,
"learning_rate": 8.282828282828283e-05,
"loss": 1.9733,
"step": 37
},
{
"epoch": 3.794071762870515,
"grad_norm": 0.37728712869432585,
"learning_rate": 8.232323232323233e-05,
"loss": 1.9648,
"step": 38
},
{
"epoch": 3.8939157566302653,
"grad_norm": 0.3922418131207954,
"learning_rate": 8.181818181818183e-05,
"loss": 1.9689,
"step": 39
},
{
"epoch": 3.9937597503900157,
"grad_norm": 0.26353046722639645,
"learning_rate": 8.131313131313132e-05,
"loss": 1.9727,
"step": 40
},
{
"epoch": 4.093603744149766,
"grad_norm": 0.3911091474488452,
"learning_rate": 8.080808080808081e-05,
"loss": 1.9631,
"step": 41
},
{
"epoch": 4.1934477379095165,
"grad_norm": 0.33402240826623614,
"learning_rate": 8.03030303030303e-05,
"loss": 1.9665,
"step": 42
},
{
"epoch": 4.2932917316692665,
"grad_norm": 0.34654808232868395,
"learning_rate": 7.97979797979798e-05,
"loss": 1.9646,
"step": 43
},
{
"epoch": 4.393135725429017,
"grad_norm": 0.3031078864703629,
"learning_rate": 7.92929292929293e-05,
"loss": 1.9693,
"step": 44
},
{
"epoch": 4.492979719188767,
"grad_norm": 0.35342072957234116,
"learning_rate": 7.878787878787879e-05,
"loss": 1.9688,
"step": 45
},
{
"epoch": 4.592823712948518,
"grad_norm": 0.3918161921811716,
"learning_rate": 7.828282828282829e-05,
"loss": 1.9609,
"step": 46
},
{
"epoch": 4.692667706708268,
"grad_norm": 0.24995683506017796,
"learning_rate": 7.777777777777778e-05,
"loss": 1.9515,
"step": 47
},
{
"epoch": 4.792511700468019,
"grad_norm": 0.3308078104166398,
"learning_rate": 7.727272727272727e-05,
"loss": 1.9607,
"step": 48
},
{
"epoch": 4.892355694227769,
"grad_norm": 0.3130926472973521,
"learning_rate": 7.676767676767676e-05,
"loss": 1.9699,
"step": 49
},
{
"epoch": 4.99219968798752,
"grad_norm": 0.30892356920484393,
"learning_rate": 7.626262626262627e-05,
"loss": 1.9645,
"step": 50
},
{
"epoch": 5.09204368174727,
"grad_norm": 0.2804202715276883,
"learning_rate": 7.575757575757576e-05,
"loss": 1.9569,
"step": 51
},
{
"epoch": 5.1918876755070205,
"grad_norm": 0.2789049399636327,
"learning_rate": 7.525252525252525e-05,
"loss": 1.9585,
"step": 52
},
{
"epoch": 5.29173166926677,
"grad_norm": 0.2906929505804403,
"learning_rate": 7.474747474747475e-05,
"loss": 1.9565,
"step": 53
},
{
"epoch": 5.391575663026521,
"grad_norm": 0.2033727950080347,
"learning_rate": 7.424242424242424e-05,
"loss": 1.9755,
"step": 54
},
{
"epoch": 5.491419656786271,
"grad_norm": 0.31364461369416163,
"learning_rate": 7.373737373737373e-05,
"loss": 1.9647,
"step": 55
},
{
"epoch": 5.591263650546022,
"grad_norm": 0.2531087381531638,
"learning_rate": 7.323232323232324e-05,
"loss": 1.9578,
"step": 56
},
{
"epoch": 5.691107644305772,
"grad_norm": 0.23764498764830225,
"learning_rate": 7.272727272727273e-05,
"loss": 1.9617,
"step": 57
},
{
"epoch": 5.790951638065523,
"grad_norm": 0.24888591334854687,
"learning_rate": 7.222222222222222e-05,
"loss": 1.963,
"step": 58
},
{
"epoch": 5.890795631825273,
"grad_norm": 0.2647075657405339,
"learning_rate": 7.171717171717171e-05,
"loss": 1.9685,
"step": 59
},
{
"epoch": 5.990639625585024,
"grad_norm": 0.27820470985704615,
"learning_rate": 7.121212121212121e-05,
"loss": 1.9654,
"step": 60
},
{
"epoch": 6.090483619344774,
"grad_norm": 0.20068946885468097,
"learning_rate": 7.07070707070707e-05,
"loss": 1.9667,
"step": 61
},
{
"epoch": 6.190327613104524,
"grad_norm": 0.25026234630326394,
"learning_rate": 7.020202020202021e-05,
"loss": 1.9542,
"step": 62
},
{
"epoch": 6.290171606864274,
"grad_norm": 0.22856925269883635,
"learning_rate": 6.96969696969697e-05,
"loss": 1.9573,
"step": 63
},
{
"epoch": 6.390015600624025,
"grad_norm": 0.2392183076591563,
"learning_rate": 6.91919191919192e-05,
"loss": 1.9647,
"step": 64
},
{
"epoch": 6.489859594383775,
"grad_norm": 0.20384525102843132,
"learning_rate": 6.86868686868687e-05,
"loss": 1.9628,
"step": 65
},
{
"epoch": 6.589703588143526,
"grad_norm": 0.23941897200051984,
"learning_rate": 6.818181818181818e-05,
"loss": 1.9667,
"step": 66
},
{
"epoch": 6.689547581903276,
"grad_norm": 0.20375278551444306,
"learning_rate": 6.767676767676769e-05,
"loss": 1.9572,
"step": 67
},
{
"epoch": 6.789391575663027,
"grad_norm": 0.20727005267599333,
"learning_rate": 6.717171717171718e-05,
"loss": 1.9581,
"step": 68
},
{
"epoch": 6.889235569422777,
"grad_norm": 0.22300809533132504,
"learning_rate": 6.666666666666667e-05,
"loss": 1.9693,
"step": 69
},
{
"epoch": 6.9890795631825275,
"grad_norm": 0.21742318730398613,
"learning_rate": 6.616161616161617e-05,
"loss": 1.9656,
"step": 70
},
{
"epoch": 7.0889235569422775,
"grad_norm": 0.20343822391223,
"learning_rate": 6.565656565656566e-05,
"loss": 1.9656,
"step": 71
},
{
"epoch": 7.188767550702028,
"grad_norm": 0.2364200066637671,
"learning_rate": 6.515151515151516e-05,
"loss": 1.95,
"step": 72
},
{
"epoch": 7.288611544461778,
"grad_norm": 0.18261048615751524,
"learning_rate": 6.464646464646466e-05,
"loss": 1.9538,
"step": 73
},
{
"epoch": 7.388455538221529,
"grad_norm": 0.24533487813163474,
"learning_rate": 6.414141414141415e-05,
"loss": 1.952,
"step": 74
},
{
"epoch": 7.488299531981279,
"grad_norm": 0.2539612930496735,
"learning_rate": 6.363636363636364e-05,
"loss": 1.9598,
"step": 75
},
{
"epoch": 7.58814352574103,
"grad_norm": 0.2991457603613546,
"learning_rate": 6.313131313131313e-05,
"loss": 1.9733,
"step": 76
},
{
"epoch": 7.68798751950078,
"grad_norm": 0.2209105824741669,
"learning_rate": 6.262626262626264e-05,
"loss": 1.9531,
"step": 77
},
{
"epoch": 7.787831513260531,
"grad_norm": 0.31698310918964695,
"learning_rate": 6.212121212121213e-05,
"loss": 1.964,
"step": 78
},
{
"epoch": 7.887675507020281,
"grad_norm": 0.17584276182725114,
"learning_rate": 6.161616161616162e-05,
"loss": 1.9644,
"step": 79
},
{
"epoch": 7.9875195007800315,
"grad_norm": 0.29996919622824225,
"learning_rate": 6.111111111111112e-05,
"loss": 1.9555,
"step": 80
},
{
"epoch": 8.087363494539781,
"grad_norm": 0.2406502202676367,
"learning_rate": 6.060606060606061e-05,
"loss": 1.9554,
"step": 81
},
{
"epoch": 8.187207488299531,
"grad_norm": 0.32705142732170855,
"learning_rate": 6.01010101010101e-05,
"loss": 1.9517,
"step": 82
},
{
"epoch": 8.287051482059283,
"grad_norm": 0.27249925952338305,
"learning_rate": 5.959595959595959e-05,
"loss": 1.9474,
"step": 83
},
{
"epoch": 8.386895475819033,
"grad_norm": 0.29448831027669287,
"learning_rate": 5.90909090909091e-05,
"loss": 1.9459,
"step": 84
},
{
"epoch": 8.486739469578783,
"grad_norm": 0.29998154037028857,
"learning_rate": 5.858585858585859e-05,
"loss": 1.9606,
"step": 85
},
{
"epoch": 8.586583463338533,
"grad_norm": 0.23153724936859055,
"learning_rate": 5.808080808080808e-05,
"loss": 1.9598,
"step": 86
},
{
"epoch": 8.686427457098285,
"grad_norm": 0.22081595887056477,
"learning_rate": 5.757575757575758e-05,
"loss": 1.9586,
"step": 87
},
{
"epoch": 8.786271450858035,
"grad_norm": 0.19177670537863922,
"learning_rate": 5.707070707070707e-05,
"loss": 1.9715,
"step": 88
},
{
"epoch": 8.886115444617785,
"grad_norm": 0.25725928107907137,
"learning_rate": 5.6565656565656563e-05,
"loss": 1.9602,
"step": 89
},
{
"epoch": 8.985959438377535,
"grad_norm": 0.26044371305524344,
"learning_rate": 5.606060606060606e-05,
"loss": 1.9607,
"step": 90
},
{
"epoch": 9.085803432137286,
"grad_norm": 0.23728151561491595,
"learning_rate": 5.555555555555556e-05,
"loss": 1.9588,
"step": 91
},
{
"epoch": 9.185647425897036,
"grad_norm": 0.20354348868729488,
"learning_rate": 5.5050505050505056e-05,
"loss": 1.9492,
"step": 92
},
{
"epoch": 9.285491419656786,
"grad_norm": 0.18672087839741056,
"learning_rate": 5.4545454545454546e-05,
"loss": 1.9457,
"step": 93
},
{
"epoch": 9.385335413416536,
"grad_norm": 0.1939858201242329,
"learning_rate": 5.4040404040404044e-05,
"loss": 1.9453,
"step": 94
},
{
"epoch": 9.485179407176288,
"grad_norm": 0.19172060706771135,
"learning_rate": 5.353535353535354e-05,
"loss": 1.958,
"step": 95
},
{
"epoch": 9.585023400936038,
"grad_norm": 0.1837920882880991,
"learning_rate": 5.303030303030303e-05,
"loss": 1.9577,
"step": 96
},
{
"epoch": 9.684867394695788,
"grad_norm": 0.2162949878555464,
"learning_rate": 5.2525252525252536e-05,
"loss": 1.9622,
"step": 97
},
{
"epoch": 9.784711388455538,
"grad_norm": 0.19325381586186333,
"learning_rate": 5.2020202020202026e-05,
"loss": 1.9433,
"step": 98
},
{
"epoch": 9.88455538221529,
"grad_norm": 0.2018142831658023,
"learning_rate": 5.151515151515152e-05,
"loss": 1.9605,
"step": 99
},
{
"epoch": 9.98439937597504,
"grad_norm": 0.176671565601027,
"learning_rate": 5.101010101010101e-05,
"loss": 1.9578,
"step": 100
},
{
"epoch": 10.08424336973479,
"grad_norm": 0.2117788085352089,
"learning_rate": 5.050505050505051e-05,
"loss": 1.9478,
"step": 101
},
{
"epoch": 10.18408736349454,
"grad_norm": 0.1816135304249716,
"learning_rate": 5e-05,
"loss": 1.9423,
"step": 102
},
{
"epoch": 10.283931357254291,
"grad_norm": 0.2680310363226074,
"learning_rate": 4.94949494949495e-05,
"loss": 1.9519,
"step": 103
},
{
"epoch": 10.383775351014041,
"grad_norm": 0.17934299698555412,
"learning_rate": 4.898989898989899e-05,
"loss": 1.9625,
"step": 104
},
{
"epoch": 10.48361934477379,
"grad_norm": 0.19786074542682824,
"learning_rate": 4.848484848484849e-05,
"loss": 1.95,
"step": 105
},
{
"epoch": 10.58346333853354,
"grad_norm": 0.17490489580858018,
"learning_rate": 4.797979797979798e-05,
"loss": 1.9513,
"step": 106
},
{
"epoch": 10.683307332293293,
"grad_norm": 0.224513887757472,
"learning_rate": 4.7474747474747476e-05,
"loss": 1.9499,
"step": 107
},
{
"epoch": 10.783151326053042,
"grad_norm": 0.16993980203530532,
"learning_rate": 4.696969696969697e-05,
"loss": 1.944,
"step": 108
},
{
"epoch": 10.882995319812792,
"grad_norm": 0.18436224376063975,
"learning_rate": 4.6464646464646464e-05,
"loss": 1.9494,
"step": 109
},
{
"epoch": 10.982839313572542,
"grad_norm": 0.1858801208244774,
"learning_rate": 4.595959595959596e-05,
"loss": 1.9504,
"step": 110
},
{
"epoch": 11.082683307332294,
"grad_norm": 0.21397140437157122,
"learning_rate": 4.545454545454546e-05,
"loss": 1.9467,
"step": 111
},
{
"epoch": 11.182527301092044,
"grad_norm": 0.16934479906947233,
"learning_rate": 4.494949494949495e-05,
"loss": 1.9475,
"step": 112
},
{
"epoch": 11.282371294851794,
"grad_norm": 0.17710883760980722,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.9418,
"step": 113
},
{
"epoch": 11.382215288611544,
"grad_norm": 0.2278025006688675,
"learning_rate": 4.3939393939393944e-05,
"loss": 1.9456,
"step": 114
},
{
"epoch": 11.482059282371296,
"grad_norm": 0.18727166458531316,
"learning_rate": 4.343434343434344e-05,
"loss": 1.9408,
"step": 115
},
{
"epoch": 11.581903276131046,
"grad_norm": 0.17348080665741175,
"learning_rate": 4.292929292929293e-05,
"loss": 1.9367,
"step": 116
},
{
"epoch": 11.681747269890796,
"grad_norm": 0.21559975863343248,
"learning_rate": 4.242424242424243e-05,
"loss": 1.9509,
"step": 117
},
{
"epoch": 11.781591263650546,
"grad_norm": 0.20515384184563593,
"learning_rate": 4.191919191919192e-05,
"loss": 1.9503,
"step": 118
},
{
"epoch": 11.881435257410295,
"grad_norm": 0.17579996751101729,
"learning_rate": 4.141414141414142e-05,
"loss": 1.9443,
"step": 119
},
{
"epoch": 11.981279251170047,
"grad_norm": 0.1870399234707776,
"learning_rate": 4.0909090909090915e-05,
"loss": 1.9507,
"step": 120
},
{
"epoch": 12.081123244929797,
"grad_norm": 0.2323975590399996,
"learning_rate": 4.0404040404040405e-05,
"loss": 1.9486,
"step": 121
},
{
"epoch": 12.180967238689547,
"grad_norm": 0.17332911391024705,
"learning_rate": 3.98989898989899e-05,
"loss": 1.9441,
"step": 122
},
{
"epoch": 12.280811232449299,
"grad_norm": 0.23886491083540215,
"learning_rate": 3.939393939393939e-05,
"loss": 1.9489,
"step": 123
},
{
"epoch": 12.380655226209049,
"grad_norm": 0.192192583869745,
"learning_rate": 3.888888888888889e-05,
"loss": 1.936,
"step": 124
},
{
"epoch": 12.480499219968799,
"grad_norm": 0.24070020033146947,
"learning_rate": 3.838383838383838e-05,
"loss": 1.9363,
"step": 125
},
{
"epoch": 12.580343213728549,
"grad_norm": 0.17061145664967614,
"learning_rate": 3.787878787878788e-05,
"loss": 1.947,
"step": 126
},
{
"epoch": 12.680187207488299,
"grad_norm": 0.20420044689274344,
"learning_rate": 3.7373737373737376e-05,
"loss": 1.9462,
"step": 127
},
{
"epoch": 12.78003120124805,
"grad_norm": 0.16640664781155742,
"learning_rate": 3.686868686868687e-05,
"loss": 1.9404,
"step": 128
},
{
"epoch": 12.8798751950078,
"grad_norm": 0.17534875646136103,
"learning_rate": 3.6363636363636364e-05,
"loss": 1.9441,
"step": 129
},
{
"epoch": 12.97971918876755,
"grad_norm": 0.1881647742956635,
"learning_rate": 3.5858585858585855e-05,
"loss": 1.9452,
"step": 130
},
{
"epoch": 13.0795631825273,
"grad_norm": 0.21130090774448568,
"learning_rate": 3.535353535353535e-05,
"loss": 1.938,
"step": 131
},
{
"epoch": 13.179407176287052,
"grad_norm": 0.19012207225624486,
"learning_rate": 3.484848484848485e-05,
"loss": 1.93,
"step": 132
},
{
"epoch": 13.279251170046802,
"grad_norm": 0.19535583015453165,
"learning_rate": 3.434343434343435e-05,
"loss": 1.9423,
"step": 133
},
{
"epoch": 13.379095163806552,
"grad_norm": 0.1972934873185412,
"learning_rate": 3.3838383838383844e-05,
"loss": 1.9449,
"step": 134
},
{
"epoch": 13.478939157566302,
"grad_norm": 0.21172258190614646,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.9423,
"step": 135
},
{
"epoch": 13.578783151326054,
"grad_norm": 0.20243808248600392,
"learning_rate": 3.282828282828283e-05,
"loss": 1.9454,
"step": 136
},
{
"epoch": 13.678627145085803,
"grad_norm": 0.29468220957824104,
"learning_rate": 3.232323232323233e-05,
"loss": 1.9329,
"step": 137
},
{
"epoch": 13.778471138845553,
"grad_norm": 0.1852836649334086,
"learning_rate": 3.181818181818182e-05,
"loss": 1.9397,
"step": 138
},
{
"epoch": 13.878315132605305,
"grad_norm": 0.17635021846243693,
"learning_rate": 3.131313131313132e-05,
"loss": 1.9414,
"step": 139
},
{
"epoch": 13.978159126365055,
"grad_norm": 0.1837620268343685,
"learning_rate": 3.080808080808081e-05,
"loss": 1.9265,
"step": 140
},
{
"epoch": 14.078003120124805,
"grad_norm": 0.1851416429157977,
"learning_rate": 3.0303030303030306e-05,
"loss": 1.938,
"step": 141
},
{
"epoch": 14.177847113884555,
"grad_norm": 0.18177436704033564,
"learning_rate": 2.9797979797979796e-05,
"loss": 1.9338,
"step": 142
},
{
"epoch": 14.277691107644305,
"grad_norm": 0.20249599488147646,
"learning_rate": 2.9292929292929294e-05,
"loss": 1.943,
"step": 143
},
{
"epoch": 14.377535101404057,
"grad_norm": 0.1914943764672633,
"learning_rate": 2.878787878787879e-05,
"loss": 1.9381,
"step": 144
},
{
"epoch": 14.477379095163807,
"grad_norm": 0.18144339446500468,
"learning_rate": 2.8282828282828282e-05,
"loss": 1.9493,
"step": 145
},
{
"epoch": 14.577223088923557,
"grad_norm": 0.22871591479507436,
"learning_rate": 2.777777777777778e-05,
"loss": 1.9394,
"step": 146
},
{
"epoch": 14.677067082683307,
"grad_norm": 0.2409736531836878,
"learning_rate": 2.7272727272727273e-05,
"loss": 1.9363,
"step": 147
},
{
"epoch": 14.776911076443058,
"grad_norm": 0.21702411701682794,
"learning_rate": 2.676767676767677e-05,
"loss": 1.9324,
"step": 148
},
{
"epoch": 14.876755070202808,
"grad_norm": 0.186963824720383,
"learning_rate": 2.6262626262626268e-05,
"loss": 1.9254,
"step": 149
},
{
"epoch": 14.976599063962558,
"grad_norm": 0.20551876684974787,
"learning_rate": 2.575757575757576e-05,
"loss": 1.9385,
"step": 150
},
{
"epoch": 15.076443057722308,
"grad_norm": 0.17794734645273458,
"learning_rate": 2.5252525252525256e-05,
"loss": 1.935,
"step": 151
},
{
"epoch": 15.17628705148206,
"grad_norm": 0.19787955354426204,
"learning_rate": 2.474747474747475e-05,
"loss": 1.9286,
"step": 152
},
{
"epoch": 15.27613104524181,
"grad_norm": 0.21663975391838738,
"learning_rate": 2.4242424242424244e-05,
"loss": 1.9274,
"step": 153
},
{
"epoch": 15.37597503900156,
"grad_norm": 0.19056508068402894,
"learning_rate": 2.3737373737373738e-05,
"loss": 1.9328,
"step": 154
},
{
"epoch": 15.47581903276131,
"grad_norm": 0.20643529046597323,
"learning_rate": 2.3232323232323232e-05,
"loss": 1.9374,
"step": 155
},
{
"epoch": 15.575663026521061,
"grad_norm": 0.17428582721990332,
"learning_rate": 2.272727272727273e-05,
"loss": 1.9435,
"step": 156
},
{
"epoch": 15.675507020280811,
"grad_norm": 0.17915807350384474,
"learning_rate": 2.2222222222222223e-05,
"loss": 1.9342,
"step": 157
},
{
"epoch": 15.775351014040561,
"grad_norm": 0.17934386940217817,
"learning_rate": 2.171717171717172e-05,
"loss": 1.9252,
"step": 158
},
{
"epoch": 15.875195007800311,
"grad_norm": 0.16971494417624172,
"learning_rate": 2.1212121212121215e-05,
"loss": 1.9333,
"step": 159
},
{
"epoch": 15.975039001560063,
"grad_norm": 0.1710725442382166,
"learning_rate": 2.070707070707071e-05,
"loss": 1.9397,
"step": 160
},
{
"epoch": 16.07488299531981,
"grad_norm": 0.16048331708079347,
"learning_rate": 2.0202020202020203e-05,
"loss": 1.9354,
"step": 161
},
{
"epoch": 16.174726989079563,
"grad_norm": 0.2209212482793433,
"learning_rate": 1.9696969696969697e-05,
"loss": 1.9572,
"step": 162
},
{
"epoch": 16.274570982839315,
"grad_norm": 0.17292517371584637,
"learning_rate": 1.919191919191919e-05,
"loss": 1.9384,
"step": 163
},
{
"epoch": 16.374414976599063,
"grad_norm": 0.1756696399704993,
"learning_rate": 1.8686868686868688e-05,
"loss": 1.9287,
"step": 164
},
{
"epoch": 16.474258970358814,
"grad_norm": 0.193814973934712,
"learning_rate": 1.8181818181818182e-05,
"loss": 1.9285,
"step": 165
},
{
"epoch": 16.574102964118566,
"grad_norm": 0.21108116449806094,
"learning_rate": 1.7676767676767676e-05,
"loss": 1.9249,
"step": 166
},
{
"epoch": 16.673946957878314,
"grad_norm": 0.164152325154632,
"learning_rate": 1.7171717171717173e-05,
"loss": 1.9335,
"step": 167
},
{
"epoch": 16.773790951638066,
"grad_norm": 0.1934976757474289,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.9344,
"step": 168
},
{
"epoch": 16.873634945397814,
"grad_norm": 0.17861559674997443,
"learning_rate": 1.6161616161616165e-05,
"loss": 1.9315,
"step": 169
},
{
"epoch": 16.973478939157566,
"grad_norm": 0.16812713496720977,
"learning_rate": 1.565656565656566e-05,
"loss": 1.9278,
"step": 170
},
{
"epoch": 17.073322932917318,
"grad_norm": 0.19243202935397094,
"learning_rate": 1.5151515151515153e-05,
"loss": 1.939,
"step": 171
},
{
"epoch": 17.173166926677066,
"grad_norm": 0.16546322416204856,
"learning_rate": 1.4646464646464647e-05,
"loss": 1.9295,
"step": 172
},
{
"epoch": 17.273010920436818,
"grad_norm": 0.19615095413628908,
"learning_rate": 1.4141414141414141e-05,
"loss": 1.9357,
"step": 173
},
{
"epoch": 17.37285491419657,
"grad_norm": 0.16562858231287156,
"learning_rate": 1.3636363636363637e-05,
"loss": 1.9372,
"step": 174
},
{
"epoch": 17.472698907956318,
"grad_norm": 0.1755423564949021,
"learning_rate": 1.3131313131313134e-05,
"loss": 1.9208,
"step": 175
},
{
"epoch": 17.57254290171607,
"grad_norm": 0.16572591523274388,
"learning_rate": 1.2626262626262628e-05,
"loss": 1.9196,
"step": 176
},
{
"epoch": 17.672386895475817,
"grad_norm": 0.16066050812369387,
"learning_rate": 1.2121212121212122e-05,
"loss": 1.9379,
"step": 177
},
{
"epoch": 17.77223088923557,
"grad_norm": 0.18230307180057742,
"learning_rate": 1.1616161616161616e-05,
"loss": 1.9344,
"step": 178
},
{
"epoch": 17.87207488299532,
"grad_norm": 0.16147840026521357,
"learning_rate": 1.1111111111111112e-05,
"loss": 1.9249,
"step": 179
},
{
"epoch": 17.97191887675507,
"grad_norm": 0.17234298543336798,
"learning_rate": 1.0606060606060607e-05,
"loss": 1.9341,
"step": 180
},
{
"epoch": 18.07176287051482,
"grad_norm": 0.16952419332241464,
"learning_rate": 1.0101010101010101e-05,
"loss": 1.9382,
"step": 181
},
{
"epoch": 18.171606864274573,
"grad_norm": 0.17503197241676455,
"learning_rate": 9.595959595959595e-06,
"loss": 1.9277,
"step": 182
},
{
"epoch": 18.27145085803432,
"grad_norm": 0.16018657280969506,
"learning_rate": 9.090909090909091e-06,
"loss": 1.9259,
"step": 183
},
{
"epoch": 18.371294851794072,
"grad_norm": 0.16577134954028483,
"learning_rate": 8.585858585858587e-06,
"loss": 1.9391,
"step": 184
},
{
"epoch": 18.47113884555382,
"grad_norm": 0.1758462044127833,
"learning_rate": 8.080808080808082e-06,
"loss": 1.9316,
"step": 185
},
{
"epoch": 18.570982839313572,
"grad_norm": 0.16928715932805172,
"learning_rate": 7.5757575757575764e-06,
"loss": 1.9218,
"step": 186
},
{
"epoch": 18.670826833073324,
"grad_norm": 0.16185874983512785,
"learning_rate": 7.0707070707070704e-06,
"loss": 1.9244,
"step": 187
},
{
"epoch": 18.770670826833072,
"grad_norm": 0.16445906712178507,
"learning_rate": 6.565656565656567e-06,
"loss": 1.9425,
"step": 188
},
{
"epoch": 18.870514820592824,
"grad_norm": 0.16313460189322437,
"learning_rate": 6.060606060606061e-06,
"loss": 1.9336,
"step": 189
},
{
"epoch": 18.970358814352576,
"grad_norm": 0.15990081630753986,
"learning_rate": 5.555555555555556e-06,
"loss": 1.9178,
"step": 190
},
{
"epoch": 19.070202808112324,
"grad_norm": 0.16547636636850527,
"learning_rate": 5.050505050505051e-06,
"loss": 1.9281,
"step": 191
},
{
"epoch": 19.170046801872076,
"grad_norm": 0.1625270231867559,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.9348,
"step": 192
},
{
"epoch": 19.269890795631824,
"grad_norm": 0.16385675767663568,
"learning_rate": 4.040404040404041e-06,
"loss": 1.9305,
"step": 193
},
{
"epoch": 19.369734789391575,
"grad_norm": 0.16718542619114216,
"learning_rate": 3.5353535353535352e-06,
"loss": 1.9376,
"step": 194
},
{
"epoch": 19.469578783151327,
"grad_norm": 0.16595125072244407,
"learning_rate": 3.0303030303030305e-06,
"loss": 1.9264,
"step": 195
},
{
"epoch": 19.569422776911075,
"grad_norm": 0.16912445317015737,
"learning_rate": 2.5252525252525253e-06,
"loss": 1.9252,
"step": 196
},
{
"epoch": 19.669266770670827,
"grad_norm": 0.15257787442711698,
"learning_rate": 2.0202020202020206e-06,
"loss": 1.9312,
"step": 197
},
{
"epoch": 19.76911076443058,
"grad_norm": 0.17270934725449602,
"learning_rate": 1.5151515151515152e-06,
"loss": 1.9241,
"step": 198
},
{
"epoch": 19.868954758190327,
"grad_norm": 0.16771403116909167,
"learning_rate": 1.0101010101010103e-06,
"loss": 1.9342,
"step": 199
},
{
"epoch": 19.96879875195008,
"grad_norm": 0.17132458008674775,
"learning_rate": 5.050505050505052e-07,
"loss": 1.9347,
"step": 200
}
],
"logging_steps": 1.0,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}