{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7414272474513438,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0037071362372567192,
"grad_norm": 167957.515625,
"learning_rate": 0.0003992,
"loss": 10.791,
"step": 1
},
{
"epoch": 0.0074142724745134385,
"grad_norm": 109901.1953125,
"learning_rate": 0.00039840000000000003,
"loss": 10.4195,
"step": 2
},
{
"epoch": 0.011121408711770158,
"grad_norm": 101352.265625,
"learning_rate": 0.0003976,
"loss": 10.0242,
"step": 3
},
{
"epoch": 0.014828544949026877,
"grad_norm": 105144.921875,
"learning_rate": 0.0003968,
"loss": 9.5684,
"step": 4
},
{
"epoch": 0.018535681186283594,
"grad_norm": 107496.6640625,
"learning_rate": 0.00039600000000000003,
"loss": 9.1775,
"step": 5
},
{
"epoch": 0.022242817423540315,
"grad_norm": 99089.9453125,
"learning_rate": 0.0003952,
"loss": 8.9319,
"step": 6
},
{
"epoch": 0.025949953660797033,
"grad_norm": 91537.4765625,
"learning_rate": 0.0003944,
"loss": 8.4907,
"step": 7
},
{
"epoch": 0.029657089898053754,
"grad_norm": 113517.7265625,
"learning_rate": 0.0003936,
"loss": 8.1206,
"step": 8
},
{
"epoch": 0.033364226135310475,
"grad_norm": 120697.0546875,
"learning_rate": 0.0003928,
"loss": 8.0377,
"step": 9
},
{
"epoch": 0.03707136237256719,
"grad_norm": 115909.0546875,
"learning_rate": 0.000392,
"loss": 7.9754,
"step": 10
},
{
"epoch": 0.04077849860982391,
"grad_norm": 116857.3359375,
"learning_rate": 0.0003912,
"loss": 7.8788,
"step": 11
},
{
"epoch": 0.04448563484708063,
"grad_norm": 101768.7109375,
"learning_rate": 0.0003904,
"loss": 7.8914,
"step": 12
},
{
"epoch": 0.04819277108433735,
"grad_norm": 101978.6015625,
"learning_rate": 0.0003896,
"loss": 7.8178,
"step": 13
},
{
"epoch": 0.051899907321594066,
"grad_norm": 90827.578125,
"learning_rate": 0.0003888,
"loss": 7.85,
"step": 14
},
{
"epoch": 0.05560704355885079,
"grad_norm": 82672.1640625,
"learning_rate": 0.000388,
"loss": 7.8152,
"step": 15
},
{
"epoch": 0.05931417979610751,
"grad_norm": 65482.09375,
"learning_rate": 0.00038720000000000003,
"loss": 7.8131,
"step": 16
},
{
"epoch": 0.06302131603336422,
"grad_norm": 55323.29296875,
"learning_rate": 0.0003864,
"loss": 7.6994,
"step": 17
},
{
"epoch": 0.06672845227062095,
"grad_norm": 94588.7109375,
"learning_rate": 0.0003856,
"loss": 7.8545,
"step": 18
},
{
"epoch": 0.07043558850787766,
"grad_norm": 50202.546875,
"learning_rate": 0.00038480000000000003,
"loss": 7.75,
"step": 19
},
{
"epoch": 0.07414272474513438,
"grad_norm": 48727.80859375,
"learning_rate": 0.000384,
"loss": 7.7449,
"step": 20
},
{
"epoch": 0.0778498609823911,
"grad_norm": 53795.23046875,
"learning_rate": 0.0003832,
"loss": 7.702,
"step": 21
},
{
"epoch": 0.08155699721964782,
"grad_norm": 55052.234375,
"learning_rate": 0.0003824,
"loss": 7.7048,
"step": 22
},
{
"epoch": 0.08526413345690455,
"grad_norm": 35977.5625,
"learning_rate": 0.0003816,
"loss": 7.7986,
"step": 23
},
{
"epoch": 0.08897126969416126,
"grad_norm": 55099.60546875,
"learning_rate": 0.0003808,
"loss": 7.8071,
"step": 24
},
{
"epoch": 0.09267840593141798,
"grad_norm": 34977.36328125,
"learning_rate": 0.00038,
"loss": 7.8473,
"step": 25
},
{
"epoch": 0.0963855421686747,
"grad_norm": 35271.6640625,
"learning_rate": 0.0003792,
"loss": 7.7099,
"step": 26
},
{
"epoch": 0.10009267840593142,
"grad_norm": 44887.39453125,
"learning_rate": 0.0003784,
"loss": 7.617,
"step": 27
},
{
"epoch": 0.10379981464318813,
"grad_norm": 41191.33203125,
"learning_rate": 0.0003776,
"loss": 7.697,
"step": 28
},
{
"epoch": 0.10750695088044486,
"grad_norm": 45202.97265625,
"learning_rate": 0.0003768,
"loss": 7.7264,
"step": 29
},
{
"epoch": 0.11121408711770157,
"grad_norm": 44944.65234375,
"learning_rate": 0.000376,
"loss": 7.7159,
"step": 30
},
{
"epoch": 0.11492122335495829,
"grad_norm": 34502.83203125,
"learning_rate": 0.0003752,
"loss": 7.7213,
"step": 31
},
{
"epoch": 0.11862835959221502,
"grad_norm": 38415.63671875,
"learning_rate": 0.00037440000000000005,
"loss": 7.6674,
"step": 32
},
{
"epoch": 0.12233549582947173,
"grad_norm": 34140.18359375,
"learning_rate": 0.00037360000000000003,
"loss": 7.6627,
"step": 33
},
{
"epoch": 0.12604263206672844,
"grad_norm": 27067.009765625,
"learning_rate": 0.00037280000000000006,
"loss": 7.6678,
"step": 34
},
{
"epoch": 0.12974976830398516,
"grad_norm": 34192.23828125,
"learning_rate": 0.00037200000000000004,
"loss": 7.7438,
"step": 35
},
{
"epoch": 0.1334569045412419,
"grad_norm": 42940.6953125,
"learning_rate": 0.0003712,
"loss": 7.6559,
"step": 36
},
{
"epoch": 0.1371640407784986,
"grad_norm": 28908.26171875,
"learning_rate": 0.00037040000000000006,
"loss": 7.6763,
"step": 37
},
{
"epoch": 0.14087117701575533,
"grad_norm": 46989.23046875,
"learning_rate": 0.00036960000000000004,
"loss": 7.6483,
"step": 38
},
{
"epoch": 0.14457831325301204,
"grad_norm": 38628.00390625,
"learning_rate": 0.0003688,
"loss": 7.5813,
"step": 39
},
{
"epoch": 0.14828544949026876,
"grad_norm": 26901.994140625,
"learning_rate": 0.00036800000000000005,
"loss": 7.7328,
"step": 40
},
{
"epoch": 0.1519925857275255,
"grad_norm": 55413.51953125,
"learning_rate": 0.00036720000000000004,
"loss": 7.5977,
"step": 41
},
{
"epoch": 0.1556997219647822,
"grad_norm": 38922.68359375,
"learning_rate": 0.0003664,
"loss": 7.6575,
"step": 42
},
{
"epoch": 0.15940685820203893,
"grad_norm": 49835.87109375,
"learning_rate": 0.00036560000000000005,
"loss": 7.7382,
"step": 43
},
{
"epoch": 0.16311399443929564,
"grad_norm": 41342.8515625,
"learning_rate": 0.00036480000000000003,
"loss": 7.7068,
"step": 44
},
{
"epoch": 0.16682113067655235,
"grad_norm": 38896.15625,
"learning_rate": 0.000364,
"loss": 7.6614,
"step": 45
},
{
"epoch": 0.1705282669138091,
"grad_norm": 29027.955078125,
"learning_rate": 0.00036320000000000005,
"loss": 7.728,
"step": 46
},
{
"epoch": 0.1742354031510658,
"grad_norm": 33758.0859375,
"learning_rate": 0.0003624,
"loss": 7.7392,
"step": 47
},
{
"epoch": 0.17794253938832252,
"grad_norm": 29002.869140625,
"learning_rate": 0.0003616,
"loss": 7.666,
"step": 48
},
{
"epoch": 0.18164967562557924,
"grad_norm": 33393.12890625,
"learning_rate": 0.00036080000000000004,
"loss": 7.6067,
"step": 49
},
{
"epoch": 0.18535681186283595,
"grad_norm": 39193.51171875,
"learning_rate": 0.00036,
"loss": 7.7868,
"step": 50
},
{
"epoch": 0.18906394810009267,
"grad_norm": 25982.78125,
"learning_rate": 0.0003592,
"loss": 7.7189,
"step": 51
},
{
"epoch": 0.1927710843373494,
"grad_norm": 28694.505859375,
"learning_rate": 0.00035840000000000004,
"loss": 7.6999,
"step": 52
},
{
"epoch": 0.19647822057460612,
"grad_norm": 26356.8828125,
"learning_rate": 0.0003576,
"loss": 7.712,
"step": 53
},
{
"epoch": 0.20018535681186284,
"grad_norm": 25880.298828125,
"learning_rate": 0.0003568,
"loss": 7.7015,
"step": 54
},
{
"epoch": 0.20389249304911955,
"grad_norm": 23557.111328125,
"learning_rate": 0.00035600000000000003,
"loss": 7.6849,
"step": 55
},
{
"epoch": 0.20759962928637626,
"grad_norm": 31365.33203125,
"learning_rate": 0.0003552,
"loss": 7.7333,
"step": 56
},
{
"epoch": 0.211306765523633,
"grad_norm": 31506.552734375,
"learning_rate": 0.0003544,
"loss": 7.7317,
"step": 57
},
{
"epoch": 0.21501390176088972,
"grad_norm": 22261.244140625,
"learning_rate": 0.00035360000000000003,
"loss": 7.6978,
"step": 58
},
{
"epoch": 0.21872103799814643,
"grad_norm": 36267.4921875,
"learning_rate": 0.0003528,
"loss": 7.7125,
"step": 59
},
{
"epoch": 0.22242817423540315,
"grad_norm": 29624.087890625,
"learning_rate": 0.00035200000000000005,
"loss": 7.734,
"step": 60
},
{
"epoch": 0.22613531047265986,
"grad_norm": 25301.228515625,
"learning_rate": 0.0003512,
"loss": 7.7287,
"step": 61
},
{
"epoch": 0.22984244670991658,
"grad_norm": 26147.228515625,
"learning_rate": 0.0003504,
"loss": 7.7059,
"step": 62
},
{
"epoch": 0.23354958294717332,
"grad_norm": 27329.443359375,
"learning_rate": 0.00034960000000000004,
"loss": 7.6798,
"step": 63
},
{
"epoch": 0.23725671918443003,
"grad_norm": 23415.9609375,
"learning_rate": 0.0003488,
"loss": 7.6968,
"step": 64
},
{
"epoch": 0.24096385542168675,
"grad_norm": 23625.1171875,
"learning_rate": 0.000348,
"loss": 7.7119,
"step": 65
},
{
"epoch": 0.24467099165894346,
"grad_norm": 23805.42578125,
"learning_rate": 0.00034720000000000004,
"loss": 7.6473,
"step": 66
},
{
"epoch": 0.24837812789620017,
"grad_norm": 47364.8203125,
"learning_rate": 0.0003464,
"loss": 7.7921,
"step": 67
},
{
"epoch": 0.2520852641334569,
"grad_norm": 29178.279296875,
"learning_rate": 0.0003456,
"loss": 7.6958,
"step": 68
},
{
"epoch": 0.2557924003707136,
"grad_norm": 26202.958984375,
"learning_rate": 0.00034480000000000003,
"loss": 7.7765,
"step": 69
},
{
"epoch": 0.2594995366079703,
"grad_norm": 48753.58203125,
"learning_rate": 0.000344,
"loss": 7.6496,
"step": 70
},
{
"epoch": 0.2632066728452271,
"grad_norm": 24508.509765625,
"learning_rate": 0.0003432,
"loss": 7.7125,
"step": 71
},
{
"epoch": 0.2669138090824838,
"grad_norm": 33996.55078125,
"learning_rate": 0.00034240000000000003,
"loss": 7.6635,
"step": 72
},
{
"epoch": 0.2706209453197405,
"grad_norm": 32989.36328125,
"learning_rate": 0.0003416,
"loss": 7.6893,
"step": 73
},
{
"epoch": 0.2743280815569972,
"grad_norm": 32296.1796875,
"learning_rate": 0.0003408,
"loss": 7.6696,
"step": 74
},
{
"epoch": 0.27803521779425394,
"grad_norm": 35698.16015625,
"learning_rate": 0.00034,
"loss": 7.6713,
"step": 75
},
{
"epoch": 0.28174235403151066,
"grad_norm": 25034.283203125,
"learning_rate": 0.0003392,
"loss": 7.6629,
"step": 76
},
{
"epoch": 0.28544949026876737,
"grad_norm": 36568.65625,
"learning_rate": 0.0003384,
"loss": 7.7075,
"step": 77
},
{
"epoch": 0.2891566265060241,
"grad_norm": 25048.875,
"learning_rate": 0.0003376,
"loss": 7.6727,
"step": 78
},
{
"epoch": 0.2928637627432808,
"grad_norm": 25438.61328125,
"learning_rate": 0.0003368,
"loss": 7.7028,
"step": 79
},
{
"epoch": 0.2965708989805375,
"grad_norm": 27428.9453125,
"learning_rate": 0.000336,
"loss": 7.6516,
"step": 80
},
{
"epoch": 0.3002780352177943,
"grad_norm": 32185.8125,
"learning_rate": 0.0003352,
"loss": 7.7127,
"step": 81
},
{
"epoch": 0.303985171455051,
"grad_norm": 28342.439453125,
"learning_rate": 0.0003344,
"loss": 7.6461,
"step": 82
},
{
"epoch": 0.3076923076923077,
"grad_norm": 22977.4140625,
"learning_rate": 0.0003336,
"loss": 7.6348,
"step": 83
},
{
"epoch": 0.3113994439295644,
"grad_norm": 28778.767578125,
"learning_rate": 0.0003328,
"loss": 7.6299,
"step": 84
},
{
"epoch": 0.31510658016682114,
"grad_norm": 21658.966796875,
"learning_rate": 0.000332,
"loss": 7.633,
"step": 85
},
{
"epoch": 0.31881371640407785,
"grad_norm": 22994.66796875,
"learning_rate": 0.0003312,
"loss": 7.648,
"step": 86
},
{
"epoch": 0.32252085264133457,
"grad_norm": 23064.05078125,
"learning_rate": 0.0003304,
"loss": 7.712,
"step": 87
},
{
"epoch": 0.3262279888785913,
"grad_norm": 34689.19140625,
"learning_rate": 0.0003296,
"loss": 7.6168,
"step": 88
},
{
"epoch": 0.329935125115848,
"grad_norm": 26677.1328125,
"learning_rate": 0.0003288,
"loss": 7.6226,
"step": 89
},
{
"epoch": 0.3336422613531047,
"grad_norm": 39699.62109375,
"learning_rate": 0.000328,
"loss": 7.6465,
"step": 90
},
{
"epoch": 0.3373493975903614,
"grad_norm": 47106.6640625,
"learning_rate": 0.0003272,
"loss": 7.6884,
"step": 91
},
{
"epoch": 0.3410565338276182,
"grad_norm": 30162.638671875,
"learning_rate": 0.0003264,
"loss": 7.7695,
"step": 92
},
{
"epoch": 0.3447636700648749,
"grad_norm": 40879.01953125,
"learning_rate": 0.0003256,
"loss": 7.7253,
"step": 93
},
{
"epoch": 0.3484708063021316,
"grad_norm": 56518.4921875,
"learning_rate": 0.00032480000000000003,
"loss": 7.6734,
"step": 94
},
{
"epoch": 0.35217794253938833,
"grad_norm": 37450.08203125,
"learning_rate": 0.000324,
"loss": 7.6897,
"step": 95
},
{
"epoch": 0.35588507877664505,
"grad_norm": 28603.978515625,
"learning_rate": 0.00032320000000000005,
"loss": 7.7346,
"step": 96
},
{
"epoch": 0.35959221501390176,
"grad_norm": 45344.12109375,
"learning_rate": 0.00032240000000000003,
"loss": 7.7564,
"step": 97
},
{
"epoch": 0.3632993512511585,
"grad_norm": 20206.189453125,
"learning_rate": 0.0003216,
"loss": 7.6465,
"step": 98
},
{
"epoch": 0.3670064874884152,
"grad_norm": 29952.62890625,
"learning_rate": 0.00032080000000000005,
"loss": 7.6581,
"step": 99
},
{
"epoch": 0.3707136237256719,
"grad_norm": 24017.02734375,
"learning_rate": 0.00032,
"loss": 7.7068,
"step": 100
},
{
"epoch": 0.3744207599629286,
"grad_norm": 21995.66796875,
"learning_rate": 0.0003192,
"loss": 7.7306,
"step": 101
},
{
"epoch": 0.37812789620018533,
"grad_norm": 22698.15625,
"learning_rate": 0.00031840000000000004,
"loss": 7.6167,
"step": 102
},
{
"epoch": 0.3818350324374421,
"grad_norm": 19390.587890625,
"learning_rate": 0.0003176,
"loss": 7.6298,
"step": 103
},
{
"epoch": 0.3855421686746988,
"grad_norm": 23548.39453125,
"learning_rate": 0.00031680000000000006,
"loss": 7.7148,
"step": 104
},
{
"epoch": 0.38924930491195553,
"grad_norm": 25070.564453125,
"learning_rate": 0.00031600000000000004,
"loss": 7.8045,
"step": 105
},
{
"epoch": 0.39295644114921224,
"grad_norm": 39852.94921875,
"learning_rate": 0.0003152,
"loss": 7.6813,
"step": 106
},
{
"epoch": 0.39666357738646896,
"grad_norm": 30994.017578125,
"learning_rate": 0.00031440000000000005,
"loss": 7.6801,
"step": 107
},
{
"epoch": 0.40037071362372567,
"grad_norm": 35010.94140625,
"learning_rate": 0.00031360000000000003,
"loss": 7.7625,
"step": 108
},
{
"epoch": 0.4040778498609824,
"grad_norm": 32364.001953125,
"learning_rate": 0.0003128,
"loss": 7.682,
"step": 109
},
{
"epoch": 0.4077849860982391,
"grad_norm": 24475.48828125,
"learning_rate": 0.00031200000000000005,
"loss": 7.6953,
"step": 110
},
{
"epoch": 0.4114921223354958,
"grad_norm": 28467.2890625,
"learning_rate": 0.00031120000000000003,
"loss": 7.7112,
"step": 111
},
{
"epoch": 0.4151992585727525,
"grad_norm": 46241.89453125,
"learning_rate": 0.0003104,
"loss": 7.625,
"step": 112
},
{
"epoch": 0.41890639481000924,
"grad_norm": 25736.814453125,
"learning_rate": 0.00030960000000000004,
"loss": 7.6842,
"step": 113
},
{
"epoch": 0.422613531047266,
"grad_norm": 25479.744140625,
"learning_rate": 0.0003088,
"loss": 7.7131,
"step": 114
},
{
"epoch": 0.4263206672845227,
"grad_norm": 32374.447265625,
"learning_rate": 0.000308,
"loss": 7.7209,
"step": 115
},
{
"epoch": 0.43002780352177944,
"grad_norm": 21930.126953125,
"learning_rate": 0.00030720000000000004,
"loss": 7.6593,
"step": 116
},
{
"epoch": 0.43373493975903615,
"grad_norm": 22632.013671875,
"learning_rate": 0.0003064,
"loss": 7.7121,
"step": 117
},
{
"epoch": 0.43744207599629287,
"grad_norm": 21551.6328125,
"learning_rate": 0.0003056,
"loss": 7.6504,
"step": 118
},
{
"epoch": 0.4411492122335496,
"grad_norm": 24234.326171875,
"learning_rate": 0.00030480000000000004,
"loss": 7.7,
"step": 119
},
{
"epoch": 0.4448563484708063,
"grad_norm": 27236.205078125,
"learning_rate": 0.000304,
"loss": 7.7073,
"step": 120
},
{
"epoch": 0.448563484708063,
"grad_norm": 20109.84765625,
"learning_rate": 0.0003032,
"loss": 7.642,
"step": 121
},
{
"epoch": 0.4522706209453197,
"grad_norm": 20982.546875,
"learning_rate": 0.00030240000000000003,
"loss": 7.7092,
"step": 122
},
{
"epoch": 0.45597775718257644,
"grad_norm": 30563.40625,
"learning_rate": 0.0003016,
"loss": 7.6086,
"step": 123
},
{
"epoch": 0.45968489341983315,
"grad_norm": 26537.8828125,
"learning_rate": 0.0003008,
"loss": 7.711,
"step": 124
},
{
"epoch": 0.4633920296570899,
"grad_norm": 26180.9765625,
"learning_rate": 0.00030000000000000003,
"loss": 7.6946,
"step": 125
},
{
"epoch": 0.46709916589434664,
"grad_norm": 25894.8828125,
"learning_rate": 0.0002992,
"loss": 7.6252,
"step": 126
},
{
"epoch": 0.47080630213160335,
"grad_norm": 17775.234375,
"learning_rate": 0.0002984,
"loss": 7.7064,
"step": 127
},
{
"epoch": 0.47451343836886006,
"grad_norm": 23387.5625,
"learning_rate": 0.0002976,
"loss": 7.606,
"step": 128
},
{
"epoch": 0.4782205746061168,
"grad_norm": 26294.63671875,
"learning_rate": 0.0002968,
"loss": 7.6753,
"step": 129
},
{
"epoch": 0.4819277108433735,
"grad_norm": 22350.404296875,
"learning_rate": 0.000296,
"loss": 7.6926,
"step": 130
},
{
"epoch": 0.4856348470806302,
"grad_norm": 23048.61328125,
"learning_rate": 0.0002952,
"loss": 7.6476,
"step": 131
},
{
"epoch": 0.4893419833178869,
"grad_norm": 26630.447265625,
"learning_rate": 0.0002944,
"loss": 7.7831,
"step": 132
},
{
"epoch": 0.49304911955514363,
"grad_norm": 34660.65234375,
"learning_rate": 0.00029360000000000003,
"loss": 7.5954,
"step": 133
},
{
"epoch": 0.49675625579240035,
"grad_norm": 19611.568359375,
"learning_rate": 0.0002928,
"loss": 7.6305,
"step": 134
},
{
"epoch": 0.5004633920296571,
"grad_norm": 38032.05078125,
"learning_rate": 0.000292,
"loss": 7.725,
"step": 135
},
{
"epoch": 0.5041705282669138,
"grad_norm": 26124.802734375,
"learning_rate": 0.00029120000000000003,
"loss": 7.6547,
"step": 136
},
{
"epoch": 0.5078776645041705,
"grad_norm": 22567.94921875,
"learning_rate": 0.0002904,
"loss": 7.7534,
"step": 137
},
{
"epoch": 0.5115848007414272,
"grad_norm": 37485.49609375,
"learning_rate": 0.0002896,
"loss": 7.6795,
"step": 138
},
{
"epoch": 0.5152919369786839,
"grad_norm": 32182.43359375,
"learning_rate": 0.0002888,
"loss": 7.7417,
"step": 139
},
{
"epoch": 0.5189990732159406,
"grad_norm": 24093.3125,
"learning_rate": 0.000288,
"loss": 7.6875,
"step": 140
},
{
"epoch": 0.5227062094531975,
"grad_norm": 23480.59765625,
"learning_rate": 0.0002872,
"loss": 7.6571,
"step": 141
},
{
"epoch": 0.5264133456904542,
"grad_norm": 34477.796875,
"learning_rate": 0.0002864,
"loss": 7.6389,
"step": 142
},
{
"epoch": 0.5301204819277109,
"grad_norm": 32023.896484375,
"learning_rate": 0.0002856,
"loss": 7.7501,
"step": 143
},
{
"epoch": 0.5338276181649676,
"grad_norm": 21589.513671875,
"learning_rate": 0.0002848,
"loss": 7.6895,
"step": 144
},
{
"epoch": 0.5375347544022243,
"grad_norm": 31786.94921875,
"learning_rate": 0.000284,
"loss": 7.7106,
"step": 145
},
{
"epoch": 0.541241890639481,
"grad_norm": 31673.8359375,
"learning_rate": 0.0002832,
"loss": 7.6815,
"step": 146
},
{
"epoch": 0.5449490268767377,
"grad_norm": 17670.734375,
"learning_rate": 0.0002824,
"loss": 7.6869,
"step": 147
},
{
"epoch": 0.5486561631139945,
"grad_norm": 34063.0703125,
"learning_rate": 0.0002816,
"loss": 7.7108,
"step": 148
},
{
"epoch": 0.5523632993512512,
"grad_norm": 36702.2734375,
"learning_rate": 0.0002808,
"loss": 7.7124,
"step": 149
},
{
"epoch": 0.5560704355885079,
"grad_norm": 22709.572265625,
"learning_rate": 0.00028,
"loss": 7.7326,
"step": 150
},
{
"epoch": 0.5597775718257646,
"grad_norm": 36804.21484375,
"learning_rate": 0.0002792,
"loss": 7.6414,
"step": 151
},
{
"epoch": 0.5634847080630213,
"grad_norm": 30339.912109375,
"learning_rate": 0.0002784,
"loss": 7.7337,
"step": 152
},
{
"epoch": 0.567191844300278,
"grad_norm": 31866.80859375,
"learning_rate": 0.00027759999999999997,
"loss": 7.6208,
"step": 153
},
{
"epoch": 0.5708989805375347,
"grad_norm": 23864.302734375,
"learning_rate": 0.0002768,
"loss": 7.7083,
"step": 154
},
{
"epoch": 0.5746061167747915,
"grad_norm": 29230.330078125,
"learning_rate": 0.000276,
"loss": 7.6914,
"step": 155
},
{
"epoch": 0.5783132530120482,
"grad_norm": 21988.8046875,
"learning_rate": 0.00027519999999999997,
"loss": 7.7157,
"step": 156
},
{
"epoch": 0.5820203892493049,
"grad_norm": 21070.361328125,
"learning_rate": 0.00027440000000000006,
"loss": 7.6987,
"step": 157
},
{
"epoch": 0.5857275254865616,
"grad_norm": 39177.30859375,
"learning_rate": 0.00027360000000000004,
"loss": 7.5922,
"step": 158
},
{
"epoch": 0.5894346617238183,
"grad_norm": 20961.755859375,
"learning_rate": 0.0002728,
"loss": 7.7621,
"step": 159
},
{
"epoch": 0.593141797961075,
"grad_norm": 24547.12890625,
"learning_rate": 0.00027200000000000005,
"loss": 7.7387,
"step": 160
},
{
"epoch": 0.5968489341983317,
"grad_norm": 17789.8125,
"learning_rate": 0.00027120000000000003,
"loss": 7.6818,
"step": 161
},
{
"epoch": 0.6005560704355886,
"grad_norm": 21633.90625,
"learning_rate": 0.0002704,
"loss": 7.6545,
"step": 162
},
{
"epoch": 0.6042632066728453,
"grad_norm": 17543.3046875,
"learning_rate": 0.00026960000000000005,
"loss": 7.6662,
"step": 163
},
{
"epoch": 0.607970342910102,
"grad_norm": 18747.458984375,
"learning_rate": 0.00026880000000000003,
"loss": 7.6227,
"step": 164
},
{
"epoch": 0.6116774791473587,
"grad_norm": 22172.224609375,
"learning_rate": 0.000268,
"loss": 7.6899,
"step": 165
},
{
"epoch": 0.6153846153846154,
"grad_norm": 19154.330078125,
"learning_rate": 0.00026720000000000004,
"loss": 7.6195,
"step": 166
},
{
"epoch": 0.6190917516218721,
"grad_norm": 20868.43359375,
"learning_rate": 0.0002664,
"loss": 7.677,
"step": 167
},
{
"epoch": 0.6227988878591288,
"grad_norm": 18564.533203125,
"learning_rate": 0.0002656,
"loss": 7.696,
"step": 168
},
{
"epoch": 0.6265060240963856,
"grad_norm": 22970.892578125,
"learning_rate": 0.00026480000000000004,
"loss": 7.6589,
"step": 169
},
{
"epoch": 0.6302131603336423,
"grad_norm": 18157.03515625,
"learning_rate": 0.000264,
"loss": 7.7017,
"step": 170
},
{
"epoch": 0.633920296570899,
"grad_norm": 20085.443359375,
"learning_rate": 0.0002632,
"loss": 7.7293,
"step": 171
},
{
"epoch": 0.6376274328081557,
"grad_norm": 26864.5390625,
"learning_rate": 0.00026240000000000004,
"loss": 7.5853,
"step": 172
},
{
"epoch": 0.6413345690454124,
"grad_norm": 21249.70703125,
"learning_rate": 0.0002616,
"loss": 7.7276,
"step": 173
},
{
"epoch": 0.6450417052826691,
"grad_norm": 17884.49609375,
"learning_rate": 0.0002608,
"loss": 7.7034,
"step": 174
},
{
"epoch": 0.6487488415199258,
"grad_norm": 19097.380859375,
"learning_rate": 0.00026000000000000003,
"loss": 7.7472,
"step": 175
},
{
"epoch": 0.6524559777571826,
"grad_norm": 21432.216796875,
"learning_rate": 0.0002592,
"loss": 7.7052,
"step": 176
},
{
"epoch": 0.6561631139944393,
"grad_norm": 17022.677734375,
"learning_rate": 0.00025840000000000005,
"loss": 7.7127,
"step": 177
},
{
"epoch": 0.659870250231696,
"grad_norm": 21216.20703125,
"learning_rate": 0.00025760000000000003,
"loss": 7.5911,
"step": 178
},
{
"epoch": 0.6635773864689527,
"grad_norm": 21638.240234375,
"learning_rate": 0.0002568,
"loss": 7.5969,
"step": 179
},
{
"epoch": 0.6672845227062094,
"grad_norm": 27894.361328125,
"learning_rate": 0.00025600000000000004,
"loss": 7.6331,
"step": 180
},
{
"epoch": 0.6709916589434661,
"grad_norm": 21034.33984375,
"learning_rate": 0.0002552,
"loss": 7.6371,
"step": 181
},
{
"epoch": 0.6746987951807228,
"grad_norm": 25746.513671875,
"learning_rate": 0.0002544,
"loss": 7.6462,
"step": 182
},
{
"epoch": 0.6784059314179796,
"grad_norm": 23690.24609375,
"learning_rate": 0.00025360000000000004,
"loss": 7.6662,
"step": 183
},
{
"epoch": 0.6821130676552364,
"grad_norm": 19138.052734375,
"learning_rate": 0.0002528,
"loss": 7.74,
"step": 184
},
{
"epoch": 0.6858202038924931,
"grad_norm": 20391.046875,
"learning_rate": 0.000252,
"loss": 7.7163,
"step": 185
},
{
"epoch": 0.6895273401297498,
"grad_norm": 17356.830078125,
"learning_rate": 0.00025120000000000003,
"loss": 7.6277,
"step": 186
},
{
"epoch": 0.6932344763670065,
"grad_norm": 27145.943359375,
"learning_rate": 0.0002504,
"loss": 7.7351,
"step": 187
},
{
"epoch": 0.6969416126042632,
"grad_norm": 18061.5703125,
"learning_rate": 0.0002496,
"loss": 7.6895,
"step": 188
},
{
"epoch": 0.70064874884152,
"grad_norm": 17943.388671875,
"learning_rate": 0.00024880000000000003,
"loss": 7.7073,
"step": 189
},
{
"epoch": 0.7043558850787767,
"grad_norm": 19911.068359375,
"learning_rate": 0.000248,
"loss": 7.7247,
"step": 190
},
{
"epoch": 0.7080630213160334,
"grad_norm": 23313.1328125,
"learning_rate": 0.0002472,
"loss": 7.6459,
"step": 191
},
{
"epoch": 0.7117701575532901,
"grad_norm": 18374.34375,
"learning_rate": 0.0002464,
"loss": 7.6853,
"step": 192
},
{
"epoch": 0.7154772937905468,
"grad_norm": 18763.783203125,
"learning_rate": 0.0002456,
"loss": 7.6097,
"step": 193
},
{
"epoch": 0.7191844300278035,
"grad_norm": 18051.265625,
"learning_rate": 0.0002448,
"loss": 7.6265,
"step": 194
},
{
"epoch": 0.7228915662650602,
"grad_norm": 21930.23828125,
"learning_rate": 0.000244,
"loss": 7.7064,
"step": 195
},
{
"epoch": 0.726598702502317,
"grad_norm": 21661.873046875,
"learning_rate": 0.0002432,
"loss": 7.7374,
"step": 196
},
{
"epoch": 0.7303058387395737,
"grad_norm": 26628.837890625,
"learning_rate": 0.0002424,
"loss": 7.6806,
"step": 197
},
{
"epoch": 0.7340129749768304,
"grad_norm": 24882.0234375,
"learning_rate": 0.0002416,
"loss": 7.6327,
"step": 198
},
{
"epoch": 0.7377201112140871,
"grad_norm": 25492.328125,
"learning_rate": 0.0002408,
"loss": 7.6956,
"step": 199
},
{
"epoch": 0.7414272474513438,
"grad_norm": 27734.201171875,
"learning_rate": 0.00024,
"loss": 7.6169,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.78927800680448e+16,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}