|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 1858, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0005382131324004305, |
|
"grad_norm": 24.24283353853933, |
|
"learning_rate": 5.376344086021506e-08, |
|
"loss": 1.3511, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.002691065662002153, |
|
"grad_norm": 23.053349252253323, |
|
"learning_rate": 2.688172043010753e-07, |
|
"loss": 1.3555, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.005382131324004306, |
|
"grad_norm": 16.288788394813515, |
|
"learning_rate": 5.376344086021506e-07, |
|
"loss": 1.2951, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008073196986006458, |
|
"grad_norm": 11.694670527806275, |
|
"learning_rate": 8.064516129032258e-07, |
|
"loss": 1.1467, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.010764262648008612, |
|
"grad_norm": 7.884187554302874, |
|
"learning_rate": 1.0752688172043011e-06, |
|
"loss": 1.0532, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013455328310010764, |
|
"grad_norm": 3.4493748603394936, |
|
"learning_rate": 1.3440860215053765e-06, |
|
"loss": 0.9368, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.016146393972012917, |
|
"grad_norm": 3.4017609070253316, |
|
"learning_rate": 1.6129032258064516e-06, |
|
"loss": 0.894, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01883745963401507, |
|
"grad_norm": 2.922525859096027, |
|
"learning_rate": 1.881720430107527e-06, |
|
"loss": 0.8884, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.021528525296017224, |
|
"grad_norm": 2.9838957239114157, |
|
"learning_rate": 2.1505376344086023e-06, |
|
"loss": 0.8546, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.024219590958019375, |
|
"grad_norm": 2.910704024976776, |
|
"learning_rate": 2.4193548387096776e-06, |
|
"loss": 0.8418, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02691065662002153, |
|
"grad_norm": 2.9651959473545695, |
|
"learning_rate": 2.688172043010753e-06, |
|
"loss": 0.8278, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.029601722282023683, |
|
"grad_norm": 3.171250875081523, |
|
"learning_rate": 2.9569892473118283e-06, |
|
"loss": 0.8258, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03229278794402583, |
|
"grad_norm": 3.0480176002644757, |
|
"learning_rate": 3.225806451612903e-06, |
|
"loss": 0.8113, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03498385360602799, |
|
"grad_norm": 2.9903885530764565, |
|
"learning_rate": 3.494623655913979e-06, |
|
"loss": 0.7944, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.03767491926803014, |
|
"grad_norm": 2.9196546664543237, |
|
"learning_rate": 3.763440860215054e-06, |
|
"loss": 0.7898, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.040365984930032295, |
|
"grad_norm": 3.228421235463176, |
|
"learning_rate": 4.032258064516129e-06, |
|
"loss": 0.7866, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.04305705059203445, |
|
"grad_norm": 3.1171353517571374, |
|
"learning_rate": 4.3010752688172045e-06, |
|
"loss": 0.7682, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.045748116254036596, |
|
"grad_norm": 3.4510773308729403, |
|
"learning_rate": 4.56989247311828e-06, |
|
"loss": 0.7596, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.04843918191603875, |
|
"grad_norm": 3.038630690638718, |
|
"learning_rate": 4.838709677419355e-06, |
|
"loss": 0.766, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.051130247578040904, |
|
"grad_norm": 3.0719133029997576, |
|
"learning_rate": 5.1075268817204305e-06, |
|
"loss": 0.7505, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.05382131324004306, |
|
"grad_norm": 3.3743499896958564, |
|
"learning_rate": 5.376344086021506e-06, |
|
"loss": 0.7589, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05651237890204521, |
|
"grad_norm": 3.1873005700182744, |
|
"learning_rate": 5.645161290322582e-06, |
|
"loss": 0.7394, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.059203444564047365, |
|
"grad_norm": 3.0033872873929486, |
|
"learning_rate": 5.9139784946236566e-06, |
|
"loss": 0.7335, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06189451022604951, |
|
"grad_norm": 3.082902596137149, |
|
"learning_rate": 6.182795698924732e-06, |
|
"loss": 0.7423, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.06458557588805167, |
|
"grad_norm": 3.120043164103206, |
|
"learning_rate": 6.451612903225806e-06, |
|
"loss": 0.7315, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06727664155005382, |
|
"grad_norm": 2.8340351071709016, |
|
"learning_rate": 6.720430107526882e-06, |
|
"loss": 0.7351, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.06996770721205597, |
|
"grad_norm": 2.9964803457547577, |
|
"learning_rate": 6.989247311827958e-06, |
|
"loss": 0.7186, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07265877287405813, |
|
"grad_norm": 3.04318708337157, |
|
"learning_rate": 7.258064516129033e-06, |
|
"loss": 0.7136, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.07534983853606028, |
|
"grad_norm": 2.9402967521903665, |
|
"learning_rate": 7.526881720430108e-06, |
|
"loss": 0.7239, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.07804090419806244, |
|
"grad_norm": 2.8263401546915987, |
|
"learning_rate": 7.795698924731183e-06, |
|
"loss": 0.7144, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.08073196986006459, |
|
"grad_norm": 2.919173396075121, |
|
"learning_rate": 8.064516129032258e-06, |
|
"loss": 0.7314, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08342303552206674, |
|
"grad_norm": 3.1452767430819994, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.7174, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.0861141011840689, |
|
"grad_norm": 2.86414031815779, |
|
"learning_rate": 8.602150537634409e-06, |
|
"loss": 0.7104, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08880516684607104, |
|
"grad_norm": 2.885400603011771, |
|
"learning_rate": 8.870967741935484e-06, |
|
"loss": 0.7021, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.09149623250807319, |
|
"grad_norm": 3.0389456906279326, |
|
"learning_rate": 9.13978494623656e-06, |
|
"loss": 0.7106, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.09418729817007535, |
|
"grad_norm": 2.7772656034353838, |
|
"learning_rate": 9.408602150537635e-06, |
|
"loss": 0.7044, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.0968783638320775, |
|
"grad_norm": 2.8113973155660408, |
|
"learning_rate": 9.67741935483871e-06, |
|
"loss": 0.6972, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09956942949407965, |
|
"grad_norm": 2.674758035204334, |
|
"learning_rate": 9.946236559139786e-06, |
|
"loss": 0.714, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.10226049515608181, |
|
"grad_norm": 3.0385651405527847, |
|
"learning_rate": 9.999858783596665e-06, |
|
"loss": 0.7141, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.10495156081808396, |
|
"grad_norm": 4.143402859397059, |
|
"learning_rate": 9.999285105629308e-06, |
|
"loss": 0.7218, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.10764262648008611, |
|
"grad_norm": 2.6943466863988386, |
|
"learning_rate": 9.998270190666602e-06, |
|
"loss": 0.6953, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11033369214208827, |
|
"grad_norm": 2.5357089850296073, |
|
"learning_rate": 9.99681412828496e-06, |
|
"loss": 0.7083, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.11302475780409042, |
|
"grad_norm": 2.9744391695253074, |
|
"learning_rate": 9.994917046996472e-06, |
|
"loss": 0.7091, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.11571582346609258, |
|
"grad_norm": 2.612058036649499, |
|
"learning_rate": 9.99257911423757e-06, |
|
"loss": 0.7011, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.11840688912809473, |
|
"grad_norm": 2.9567175133198296, |
|
"learning_rate": 9.989800536354243e-06, |
|
"loss": 0.6911, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.12109795479009688, |
|
"grad_norm": 2.698848753772084, |
|
"learning_rate": 9.986581558583824e-06, |
|
"loss": 0.7089, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.12378902045209902, |
|
"grad_norm": 2.8530687570791873, |
|
"learning_rate": 9.98292246503335e-06, |
|
"loss": 0.683, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.1264800861141012, |
|
"grad_norm": 2.680876650789018, |
|
"learning_rate": 9.978823578654486e-06, |
|
"loss": 0.6921, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.12917115177610333, |
|
"grad_norm": 2.7266477180061193, |
|
"learning_rate": 9.97428526121502e-06, |
|
"loss": 0.7068, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1318622174381055, |
|
"grad_norm": 2.549125398235196, |
|
"learning_rate": 9.969307913266931e-06, |
|
"loss": 0.6942, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.13455328310010764, |
|
"grad_norm": 2.6548338170679417, |
|
"learning_rate": 9.963891974111042e-06, |
|
"loss": 0.6866, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1372443487621098, |
|
"grad_norm": 2.5403709133766075, |
|
"learning_rate": 9.958037921758241e-06, |
|
"loss": 0.6842, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.13993541442411195, |
|
"grad_norm": 2.573538112309061, |
|
"learning_rate": 9.951746272887298e-06, |
|
"loss": 0.683, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.1426264800861141, |
|
"grad_norm": 2.54743165429096, |
|
"learning_rate": 9.945017582799256e-06, |
|
"loss": 0.6768, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.14531754574811626, |
|
"grad_norm": 2.710982640485438, |
|
"learning_rate": 9.937852445368427e-06, |
|
"loss": 0.6797, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.1480086114101184, |
|
"grad_norm": 2.6229565132141244, |
|
"learning_rate": 9.930251492989972e-06, |
|
"loss": 0.6815, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.15069967707212056, |
|
"grad_norm": 2.458184332927479, |
|
"learning_rate": 9.922215396524089e-06, |
|
"loss": 0.6774, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1533907427341227, |
|
"grad_norm": 2.5496173691872333, |
|
"learning_rate": 9.913744865236798e-06, |
|
"loss": 0.6804, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.15608180839612487, |
|
"grad_norm": 2.6173567920053915, |
|
"learning_rate": 9.904840646737346e-06, |
|
"loss": 0.6735, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.158772874058127, |
|
"grad_norm": 2.510183361119406, |
|
"learning_rate": 9.895503526912224e-06, |
|
"loss": 0.679, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.16146393972012918, |
|
"grad_norm": 2.6302179408656863, |
|
"learning_rate": 9.885734329855798e-06, |
|
"loss": 0.6767, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.16415500538213132, |
|
"grad_norm": 2.4779457702117966, |
|
"learning_rate": 9.875533917797579e-06, |
|
"loss": 0.6583, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.1668460710441335, |
|
"grad_norm": 2.565978452487799, |
|
"learning_rate": 9.864903191026125e-06, |
|
"loss": 0.6601, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.16953713670613563, |
|
"grad_norm": 2.5755175403373594, |
|
"learning_rate": 9.853843087809574e-06, |
|
"loss": 0.6729, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.1722282023681378, |
|
"grad_norm": 2.4549967571921276, |
|
"learning_rate": 9.842354584312841e-06, |
|
"loss": 0.6863, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.17491926803013993, |
|
"grad_norm": 2.603864578009155, |
|
"learning_rate": 9.830438694511454e-06, |
|
"loss": 0.6667, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.17761033369214208, |
|
"grad_norm": 2.5406282084218237, |
|
"learning_rate": 9.818096470102067e-06, |
|
"loss": 0.6674, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.18030139935414424, |
|
"grad_norm": 2.579463338921953, |
|
"learning_rate": 9.805329000409634e-06, |
|
"loss": 0.6591, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.18299246501614638, |
|
"grad_norm": 2.6031990059371446, |
|
"learning_rate": 9.792137412291265e-06, |
|
"loss": 0.6541, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.18568353067814855, |
|
"grad_norm": 2.595725262832742, |
|
"learning_rate": 9.778522870036768e-06, |
|
"loss": 0.659, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.1883745963401507, |
|
"grad_norm": 2.5089835346062235, |
|
"learning_rate": 9.764486575265893e-06, |
|
"loss": 0.6402, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.19106566200215286, |
|
"grad_norm": 2.5469279751039444, |
|
"learning_rate": 9.750029766822277e-06, |
|
"loss": 0.663, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.193756727664155, |
|
"grad_norm": 2.4704112213266236, |
|
"learning_rate": 9.735153720664096e-06, |
|
"loss": 0.6401, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.19644779332615717, |
|
"grad_norm": 2.406691801866414, |
|
"learning_rate": 9.719859749751462e-06, |
|
"loss": 0.6637, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.1991388589881593, |
|
"grad_norm": 2.42146963438233, |
|
"learning_rate": 9.704149203930522e-06, |
|
"loss": 0.6547, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.20182992465016147, |
|
"grad_norm": 1336.2542331514433, |
|
"learning_rate": 9.688023469814345e-06, |
|
"loss": 0.6303, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.20452099031216361, |
|
"grad_norm": 2.4257729190218087, |
|
"learning_rate": 9.671483970660519e-06, |
|
"loss": 0.6354, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.20721205597416578, |
|
"grad_norm": 2.7259145716285977, |
|
"learning_rate": 9.654532166245543e-06, |
|
"loss": 0.6497, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.20990312163616792, |
|
"grad_norm": 2.4574395200249968, |
|
"learning_rate": 9.637169552735985e-06, |
|
"loss": 0.6345, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.21259418729817006, |
|
"grad_norm": 2.3325658754447547, |
|
"learning_rate": 9.619397662556434e-06, |
|
"loss": 0.6391, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.21528525296017223, |
|
"grad_norm": 2.431806866952552, |
|
"learning_rate": 9.601218064254245e-06, |
|
"loss": 0.6173, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.21797631862217437, |
|
"grad_norm": 2.445217378104123, |
|
"learning_rate": 9.582632362361098e-06, |
|
"loss": 0.6401, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.22066738428417654, |
|
"grad_norm": 2.6477368345894092, |
|
"learning_rate": 9.563642197251382e-06, |
|
"loss": 0.6488, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.22335844994617868, |
|
"grad_norm": 2.4707475353296493, |
|
"learning_rate": 9.54424924499742e-06, |
|
"loss": 0.6403, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.22604951560818085, |
|
"grad_norm": 2.494450021488585, |
|
"learning_rate": 9.524455217221537e-06, |
|
"loss": 0.6384, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.22874058127018299, |
|
"grad_norm": 2.4591557732917506, |
|
"learning_rate": 9.504261860944984e-06, |
|
"loss": 0.6324, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.23143164693218515, |
|
"grad_norm": 2.4302912835831996, |
|
"learning_rate": 9.48367095843376e-06, |
|
"loss": 0.6117, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2341227125941873, |
|
"grad_norm": 2.313645143696204, |
|
"learning_rate": 9.462684327041298e-06, |
|
"loss": 0.6313, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.23681377825618946, |
|
"grad_norm": 2.5570998142302237, |
|
"learning_rate": 9.441303819048073e-06, |
|
"loss": 0.6375, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.2395048439181916, |
|
"grad_norm": 2.3759237627339362, |
|
"learning_rate": 9.41953132149811e-06, |
|
"loss": 0.64, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.24219590958019377, |
|
"grad_norm": 2.496872534117237, |
|
"learning_rate": 9.397368756032445e-06, |
|
"loss": 0.6295, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2448869752421959, |
|
"grad_norm": 2.4744584085996166, |
|
"learning_rate": 9.374818078719515e-06, |
|
"loss": 0.6352, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.24757804090419805, |
|
"grad_norm": 2.392085926832327, |
|
"learning_rate": 9.351881279882512e-06, |
|
"loss": 0.6223, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.2502691065662002, |
|
"grad_norm": 2.435778322562653, |
|
"learning_rate": 9.328560383923724e-06, |
|
"loss": 0.6293, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.2529601722282024, |
|
"grad_norm": 2.4713882620232814, |
|
"learning_rate": 9.304857449145858e-06, |
|
"loss": 0.616, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.2556512378902045, |
|
"grad_norm": 2.4022713737717085, |
|
"learning_rate": 9.280774567570372e-06, |
|
"loss": 0.6278, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.25834230355220666, |
|
"grad_norm": 2.5659042225767106, |
|
"learning_rate": 9.256313864752838e-06, |
|
"loss": 0.6117, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.26103336921420883, |
|
"grad_norm": 2.369609674870893, |
|
"learning_rate": 9.231477499595333e-06, |
|
"loss": 0.6154, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.263724434876211, |
|
"grad_norm": 2.541597496063988, |
|
"learning_rate": 9.206267664155906e-06, |
|
"loss": 0.596, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2664155005382131, |
|
"grad_norm": 2.497006686449089, |
|
"learning_rate": 9.180686583455097e-06, |
|
"loss": 0.6013, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.2691065662002153, |
|
"grad_norm": 2.310023327152278, |
|
"learning_rate": 9.154736515279557e-06, |
|
"loss": 0.6065, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.27179763186221745, |
|
"grad_norm": 2.377810706006744, |
|
"learning_rate": 9.12841974998278e-06, |
|
"loss": 0.6073, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.2744886975242196, |
|
"grad_norm": 2.488273336117645, |
|
"learning_rate": 9.101738610282956e-06, |
|
"loss": 0.605, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.27717976318622173, |
|
"grad_norm": 2.5444483959678705, |
|
"learning_rate": 9.074695451057966e-06, |
|
"loss": 0.5896, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.2798708288482239, |
|
"grad_norm": 2.532157806962122, |
|
"learning_rate": 9.047292659137542e-06, |
|
"loss": 0.6011, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.28256189451022606, |
|
"grad_norm": 2.4637500565359636, |
|
"learning_rate": 9.019532653092597e-06, |
|
"loss": 0.583, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.2852529601722282, |
|
"grad_norm": 2.4583344155487237, |
|
"learning_rate": 8.99141788302178e-06, |
|
"loss": 0.6061, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.28794402583423034, |
|
"grad_norm": 2.657382385667294, |
|
"learning_rate": 8.962950830335213e-06, |
|
"loss": 0.5966, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.2906350914962325, |
|
"grad_norm": 2.3954255779376723, |
|
"learning_rate": 8.93413400753549e-06, |
|
"loss": 0.6012, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.2933261571582347, |
|
"grad_norm": 2.4583747267385285, |
|
"learning_rate": 8.90496995799592e-06, |
|
"loss": 0.5986, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.2960172228202368, |
|
"grad_norm": 2.3658026224876774, |
|
"learning_rate": 8.875461255736055e-06, |
|
"loss": 0.579, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.29870828848223896, |
|
"grad_norm": 2.4605033576758415, |
|
"learning_rate": 8.845610505194495e-06, |
|
"loss": 0.5931, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.3013993541442411, |
|
"grad_norm": 2.4119728213714664, |
|
"learning_rate": 8.815420340999034e-06, |
|
"loss": 0.5947, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.3040904198062433, |
|
"grad_norm": 2.34883220316883, |
|
"learning_rate": 8.784893427734117e-06, |
|
"loss": 0.5834, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.3067814854682454, |
|
"grad_norm": 2.271640059488949, |
|
"learning_rate": 8.754032459705672e-06, |
|
"loss": 0.5813, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.3094725511302476, |
|
"grad_norm": 2.2755797451544586, |
|
"learning_rate": 8.722840160703304e-06, |
|
"loss": 0.5726, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.31216361679224974, |
|
"grad_norm": 2.307185651566321, |
|
"learning_rate": 8.691319283759896e-06, |
|
"loss": 0.5954, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.3148546824542519, |
|
"grad_norm": 2.4397904508735397, |
|
"learning_rate": 8.659472610908628e-06, |
|
"loss": 0.5739, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.317545748116254, |
|
"grad_norm": 2.399974851009762, |
|
"learning_rate": 8.627302952937431e-06, |
|
"loss": 0.5691, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.3202368137782562, |
|
"grad_norm": 2.366339579258752, |
|
"learning_rate": 8.594813149140908e-06, |
|
"loss": 0.5799, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.32292787944025836, |
|
"grad_norm": 2.500951480557945, |
|
"learning_rate": 8.56200606706974e-06, |
|
"loss": 0.5676, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.32561894510226047, |
|
"grad_norm": 2.4259225623593546, |
|
"learning_rate": 8.528884602277593e-06, |
|
"loss": 0.5696, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.32831001076426264, |
|
"grad_norm": 2.5353727059026423, |
|
"learning_rate": 8.495451678065563e-06, |
|
"loss": 0.5714, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.3310010764262648, |
|
"grad_norm": 2.342282674374617, |
|
"learning_rate": 8.461710245224149e-06, |
|
"loss": 0.5786, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.333692142088267, |
|
"grad_norm": 2.3739335512730873, |
|
"learning_rate": 8.42766328177284e-06, |
|
"loss": 0.5645, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.3363832077502691, |
|
"grad_norm": 2.4085412512734514, |
|
"learning_rate": 8.393313792697251e-06, |
|
"loss": 0.5691, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.33907427341227125, |
|
"grad_norm": 2.518154153319773, |
|
"learning_rate": 8.358664809683926e-06, |
|
"loss": 0.5598, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.3417653390742734, |
|
"grad_norm": 2.3735183991654094, |
|
"learning_rate": 8.323719390852735e-06, |
|
"loss": 0.5624, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.3444564047362756, |
|
"grad_norm": 2.513284443243831, |
|
"learning_rate": 8.288480620486991e-06, |
|
"loss": 0.5666, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3471474703982777, |
|
"grad_norm": 2.657908476894027, |
|
"learning_rate": 8.252951608761217e-06, |
|
"loss": 0.5538, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.34983853606027987, |
|
"grad_norm": 2.6512730585233046, |
|
"learning_rate": 8.217135491466636e-06, |
|
"loss": 0.562, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.35252960172228204, |
|
"grad_norm": 2.413492734920311, |
|
"learning_rate": 8.181035429734423e-06, |
|
"loss": 0.5684, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.35522066738428415, |
|
"grad_norm": 2.389143978534841, |
|
"learning_rate": 8.144654609756685e-06, |
|
"loss": 0.5584, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.3579117330462863, |
|
"grad_norm": 2.3885593909778193, |
|
"learning_rate": 8.10799624250527e-06, |
|
"loss": 0.5623, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.3606027987082885, |
|
"grad_norm": 2.3537366149001575, |
|
"learning_rate": 8.071063563448341e-06, |
|
"loss": 0.558, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.36329386437029065, |
|
"grad_norm": 2.431516026741045, |
|
"learning_rate": 8.03385983226483e-06, |
|
"loss": 0.5606, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.36598493003229277, |
|
"grad_norm": 2.5156730038085247, |
|
"learning_rate": 7.996388332556735e-06, |
|
"loss": 0.5498, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.36867599569429493, |
|
"grad_norm": 2.3917209485074125, |
|
"learning_rate": 7.958652371559313e-06, |
|
"loss": 0.5452, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.3713670613562971, |
|
"grad_norm": 2.394278884481158, |
|
"learning_rate": 7.920655279849173e-06, |
|
"loss": 0.5366, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.37405812701829927, |
|
"grad_norm": 2.3161033892573397, |
|
"learning_rate": 7.882400411050328e-06, |
|
"loss": 0.5574, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.3767491926803014, |
|
"grad_norm": 2.42710195896889, |
|
"learning_rate": 7.843891141538201e-06, |
|
"loss": 0.5529, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.37944025834230355, |
|
"grad_norm": 2.432838785663918, |
|
"learning_rate": 7.80513087014163e-06, |
|
"loss": 0.5269, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.3821313240043057, |
|
"grad_norm": 2.3546373641715985, |
|
"learning_rate": 7.766123017842877e-06, |
|
"loss": 0.5594, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.3848223896663079, |
|
"grad_norm": 2.4291021306230705, |
|
"learning_rate": 7.726871027475709e-06, |
|
"loss": 0.5465, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.38751345532831, |
|
"grad_norm": 2.4162151105545764, |
|
"learning_rate": 7.687378363421512e-06, |
|
"loss": 0.5197, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.39020452099031216, |
|
"grad_norm": 2.4159850873626896, |
|
"learning_rate": 7.647648511303545e-06, |
|
"loss": 0.5395, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.39289558665231433, |
|
"grad_norm": 2.453890499939525, |
|
"learning_rate": 7.607684977679284e-06, |
|
"loss": 0.5253, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.39558665231431644, |
|
"grad_norm": 2.36425752479743, |
|
"learning_rate": 7.567491289730944e-06, |
|
"loss": 0.5423, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.3982777179763186, |
|
"grad_norm": 2.7992812996955343, |
|
"learning_rate": 7.52707099495416e-06, |
|
"loss": 0.5312, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.4009687836383208, |
|
"grad_norm": 2.308832567539322, |
|
"learning_rate": 7.4864276608448925e-06, |
|
"loss": 0.5304, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.40365984930032295, |
|
"grad_norm": 2.3132410981713822, |
|
"learning_rate": 7.44556487458456e-06, |
|
"loss": 0.5289, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.40635091496232506, |
|
"grad_norm": 2.481793941375617, |
|
"learning_rate": 7.404486242723428e-06, |
|
"loss": 0.5447, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.40904198062432723, |
|
"grad_norm": 2.3277863998931085, |
|
"learning_rate": 7.363195390862298e-06, |
|
"loss": 0.5281, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.4117330462863294, |
|
"grad_norm": 3.1626258112156336, |
|
"learning_rate": 7.321695963332516e-06, |
|
"loss": 0.5297, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.41442411194833156, |
|
"grad_norm": 2.3973285291004935, |
|
"learning_rate": 7.279991622874319e-06, |
|
"loss": 0.51, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.4171151776103337, |
|
"grad_norm": 2.5519368199855186, |
|
"learning_rate": 7.238086050313563e-06, |
|
"loss": 0.5204, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.41980624327233584, |
|
"grad_norm": 2.2307291258782573, |
|
"learning_rate": 7.195982944236853e-06, |
|
"loss": 0.5139, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.422497308934338, |
|
"grad_norm": 2.3366059256739455, |
|
"learning_rate": 7.1536860206651025e-06, |
|
"loss": 0.5125, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.4251883745963401, |
|
"grad_norm": 2.39046452051348, |
|
"learning_rate": 7.1111990127255684e-06, |
|
"loss": 0.5045, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.4278794402583423, |
|
"grad_norm": 2.325513036750954, |
|
"learning_rate": 7.068525670322349e-06, |
|
"loss": 0.5175, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.43057050592034446, |
|
"grad_norm": 2.65907942713296, |
|
"learning_rate": 7.025669759805431e-06, |
|
"loss": 0.5124, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4332615715823466, |
|
"grad_norm": 2.371613534546568, |
|
"learning_rate": 6.982635063638265e-06, |
|
"loss": 0.5047, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.43595263724434874, |
|
"grad_norm": 2.581116622093549, |
|
"learning_rate": 6.939425380063924e-06, |
|
"loss": 0.5086, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.4386437029063509, |
|
"grad_norm": 2.483792729537892, |
|
"learning_rate": 6.896044522769879e-06, |
|
"loss": 0.5238, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.4413347685683531, |
|
"grad_norm": 2.218951180296122, |
|
"learning_rate": 6.852496320551387e-06, |
|
"loss": 0.5208, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.44402583423035524, |
|
"grad_norm": 2.3565229050589087, |
|
"learning_rate": 6.808784616973581e-06, |
|
"loss": 0.5055, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.44671689989235736, |
|
"grad_norm": 2.3807615934870556, |
|
"learning_rate": 6.76491327003222e-06, |
|
"loss": 0.5116, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.4494079655543595, |
|
"grad_norm": 2.267351146002801, |
|
"learning_rate": 6.720886151813194e-06, |
|
"loss": 0.4973, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.4520990312163617, |
|
"grad_norm": 2.288675152175709, |
|
"learning_rate": 6.676707148150763e-06, |
|
"loss": 0.5027, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.45479009687836386, |
|
"grad_norm": 2.1920166975520607, |
|
"learning_rate": 6.632380158284607e-06, |
|
"loss": 0.5004, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.45748116254036597, |
|
"grad_norm": 2.4271726465058734, |
|
"learning_rate": 6.587909094515663e-06, |
|
"loss": 0.4945, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.46017222820236814, |
|
"grad_norm": 2.4568232816821767, |
|
"learning_rate": 6.5432978818608395e-06, |
|
"loss": 0.5072, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.4628632938643703, |
|
"grad_norm": 2.3137099886613552, |
|
"learning_rate": 6.498550457706584e-06, |
|
"loss": 0.4933, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4655543595263724, |
|
"grad_norm": 2.3214692306145497, |
|
"learning_rate": 6.453670771461377e-06, |
|
"loss": 0.4824, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.4682454251883746, |
|
"grad_norm": 2.4425553236207014, |
|
"learning_rate": 6.408662784207149e-06, |
|
"loss": 0.49, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.47093649085037675, |
|
"grad_norm": 2.4523524920489725, |
|
"learning_rate": 6.363530468349686e-06, |
|
"loss": 0.4877, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.4736275565123789, |
|
"grad_norm": 2.3770320295081517, |
|
"learning_rate": 6.318277807268013e-06, |
|
"loss": 0.4921, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.47631862217438103, |
|
"grad_norm": 2.302817515096427, |
|
"learning_rate": 6.27290879496283e-06, |
|
"loss": 0.499, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.4790096878363832, |
|
"grad_norm": 2.3749605927816115, |
|
"learning_rate": 6.227427435703997e-06, |
|
"loss": 0.5034, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.48170075349838537, |
|
"grad_norm": 2.2698944933417047, |
|
"learning_rate": 6.181837743677118e-06, |
|
"loss": 0.4836, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.48439181916038754, |
|
"grad_norm": 2.421109939464108, |
|
"learning_rate": 6.136143742629252e-06, |
|
"loss": 0.4844, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.48708288482238965, |
|
"grad_norm": 2.3591927038924205, |
|
"learning_rate": 6.09034946551377e-06, |
|
"loss": 0.4916, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.4897739504843918, |
|
"grad_norm": 2.2834315500664375, |
|
"learning_rate": 6.044458954134411e-06, |
|
"loss": 0.4818, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.492465016146394, |
|
"grad_norm": 2.319135479132979, |
|
"learning_rate": 5.998476258788555e-06, |
|
"loss": 0.4706, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.4951560818083961, |
|
"grad_norm": 2.318871753425039, |
|
"learning_rate": 5.952405437909738e-06, |
|
"loss": 0.4808, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.49784714747039827, |
|
"grad_norm": 2.286261647310775, |
|
"learning_rate": 5.90625055770946e-06, |
|
"loss": 0.4745, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.5005382131324004, |
|
"grad_norm": 2.40479662626196, |
|
"learning_rate": 5.860015691818292e-06, |
|
"loss": 0.4759, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5032292787944026, |
|
"grad_norm": 2.2745670638148865, |
|
"learning_rate": 5.813704920926352e-06, |
|
"loss": 0.4676, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.5059203444564048, |
|
"grad_norm": 2.3623230849926586, |
|
"learning_rate": 5.767322332423128e-06, |
|
"loss": 0.485, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5086114101184069, |
|
"grad_norm": 2.2862433160959754, |
|
"learning_rate": 5.720872020036734e-06, |
|
"loss": 0.4609, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.511302475780409, |
|
"grad_norm": 2.3723463853439566, |
|
"learning_rate": 5.674358083472598e-06, |
|
"loss": 0.4845, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.5139935414424112, |
|
"grad_norm": 2.410416517790562, |
|
"learning_rate": 5.6277846280516125e-06, |
|
"loss": 0.4619, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.5166846071044133, |
|
"grad_norm": 2.240918133117095, |
|
"learning_rate": 5.581155764347812e-06, |
|
"loss": 0.4791, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5193756727664155, |
|
"grad_norm": 2.377935217580519, |
|
"learning_rate": 5.534475607825566e-06, |
|
"loss": 0.4675, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.5220667384284177, |
|
"grad_norm": 2.2782957919391134, |
|
"learning_rate": 5.487748278476342e-06, |
|
"loss": 0.458, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.5247578040904198, |
|
"grad_norm": 2.366031689261752, |
|
"learning_rate": 5.440977900455093e-06, |
|
"loss": 0.4555, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.527448869752422, |
|
"grad_norm": 2.28355819182803, |
|
"learning_rate": 5.39416860171624e-06, |
|
"loss": 0.4665, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.5301399354144241, |
|
"grad_norm": 2.3582263483375967, |
|
"learning_rate": 5.347324513649352e-06, |
|
"loss": 0.4536, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.5328310010764262, |
|
"grad_norm": 2.4053179534063758, |
|
"learning_rate": 5.300449770714502e-06, |
|
"loss": 0.4488, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.5355220667384284, |
|
"grad_norm": 2.4724751592286545, |
|
"learning_rate": 5.253548510077366e-06, |
|
"loss": 0.4464, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.5382131324004306, |
|
"grad_norm": 2.4025395912753233, |
|
"learning_rate": 5.206624871244066e-06, |
|
"loss": 0.4637, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5409041980624327, |
|
"grad_norm": 2.3589159913710045, |
|
"learning_rate": 5.159682995695833e-06, |
|
"loss": 0.4598, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.5435952637244349, |
|
"grad_norm": 2.3402445469371003, |
|
"learning_rate": 5.112727026523461e-06, |
|
"loss": 0.4581, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.5462863293864371, |
|
"grad_norm": 2.308875795125107, |
|
"learning_rate": 5.065761108061658e-06, |
|
"loss": 0.4554, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.5489773950484392, |
|
"grad_norm": 2.2520574678056025, |
|
"learning_rate": 5.018789385523245e-06, |
|
"loss": 0.4311, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.5516684607104413, |
|
"grad_norm": 2.3460733596653043, |
|
"learning_rate": 4.971816004633323e-06, |
|
"loss": 0.4562, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.5543595263724435, |
|
"grad_norm": 2.295752331592005, |
|
"learning_rate": 4.924845111263349e-06, |
|
"loss": 0.4534, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.5570505920344456, |
|
"grad_norm": 2.288064876450578, |
|
"learning_rate": 4.877880851065238e-06, |
|
"loss": 0.4373, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.5597416576964478, |
|
"grad_norm": 2.3406651223445345, |
|
"learning_rate": 4.830927369105457e-06, |
|
"loss": 0.4556, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.56243272335845, |
|
"grad_norm": 2.361735026112144, |
|
"learning_rate": 4.783988809499187e-06, |
|
"loss": 0.4463, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.5651237890204521, |
|
"grad_norm": 2.3200974121365827, |
|
"learning_rate": 4.737069315044562e-06, |
|
"loss": 0.4447, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.5678148546824543, |
|
"grad_norm": 2.3623013069638588, |
|
"learning_rate": 4.690173026857028e-06, |
|
"loss": 0.4526, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.5705059203444564, |
|
"grad_norm": 2.3444553715914793, |
|
"learning_rate": 4.643304084003839e-06, |
|
"loss": 0.4375, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.5731969860064585, |
|
"grad_norm": 2.2744066980818136, |
|
"learning_rate": 4.596466623138756e-06, |
|
"loss": 0.4431, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.5758880516684607, |
|
"grad_norm": 2.3747643214433376, |
|
"learning_rate": 4.549664778136933e-06, |
|
"loss": 0.4408, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.5785791173304629, |
|
"grad_norm": 2.2085805209914104, |
|
"learning_rate": 4.502902679730074e-06, |
|
"loss": 0.4372, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.581270182992465, |
|
"grad_norm": 2.317483774738, |
|
"learning_rate": 4.456184455141843e-06, |
|
"loss": 0.4305, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.5839612486544672, |
|
"grad_norm": 2.1925098601534048, |
|
"learning_rate": 4.4095142277236015e-06, |
|
"loss": 0.4432, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.5866523143164694, |
|
"grad_norm": 2.4671589997274452, |
|
"learning_rate": 4.362896116590475e-06, |
|
"loss": 0.431, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.5893433799784715, |
|
"grad_norm": 2.342590603863471, |
|
"learning_rate": 4.316334236257818e-06, |
|
"loss": 0.4363, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.5920344456404736, |
|
"grad_norm": 2.1976368890683684, |
|
"learning_rate": 4.269832696278038e-06, |
|
"loss": 0.4383, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5947255113024758, |
|
"grad_norm": 2.28566900517168, |
|
"learning_rate": 4.223395600877912e-06, |
|
"loss": 0.4246, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.5974165769644779, |
|
"grad_norm": 2.3242280471032206, |
|
"learning_rate": 4.17702704859633e-06, |
|
"loss": 0.4364, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.6001076426264801, |
|
"grad_norm": 2.471559272198895, |
|
"learning_rate": 4.130731131922574e-06, |
|
"loss": 0.426, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.6027987082884823, |
|
"grad_norm": 2.2329836528845135, |
|
"learning_rate": 4.0845119369350995e-06, |
|
"loss": 0.4291, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.6054897739504844, |
|
"grad_norm": 2.330321196476834, |
|
"learning_rate": 4.038373542940905e-06, |
|
"loss": 0.4177, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.6081808396124866, |
|
"grad_norm": 2.214374848104633, |
|
"learning_rate": 3.992320022115492e-06, |
|
"loss": 0.4331, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.6108719052744886, |
|
"grad_norm": 2.25904143882246, |
|
"learning_rate": 3.946355439143455e-06, |
|
"loss": 0.4328, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.6135629709364908, |
|
"grad_norm": 2.3404313849850333, |
|
"learning_rate": 3.900483850859735e-06, |
|
"loss": 0.4262, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.616254036598493, |
|
"grad_norm": 2.1929899817554426, |
|
"learning_rate": 3.854709305891557e-06, |
|
"loss": 0.4133, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.6189451022604952, |
|
"grad_norm": 2.1210538444123466, |
|
"learning_rate": 3.8090358443010993e-06, |
|
"loss": 0.4098, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.6216361679224973, |
|
"grad_norm": 2.3023372298980536, |
|
"learning_rate": 3.7634674972289227e-06, |
|
"loss": 0.4319, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.6243272335844995, |
|
"grad_norm": 2.19595775993295, |
|
"learning_rate": 3.718008286538179e-06, |
|
"loss": 0.4227, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.6270182992465017, |
|
"grad_norm": 2.228791736262618, |
|
"learning_rate": 3.67266222445964e-06, |
|
"loss": 0.4195, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.6297093649085038, |
|
"grad_norm": 2.2061824300529644, |
|
"learning_rate": 3.627433313237576e-06, |
|
"loss": 0.4135, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.6324004305705059, |
|
"grad_norm": 2.3224307120244436, |
|
"learning_rate": 3.5823255447765233e-06, |
|
"loss": 0.421, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.635091496232508, |
|
"grad_norm": 2.2222499192405754, |
|
"learning_rate": 3.5373429002889583e-06, |
|
"loss": 0.4102, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.6377825618945102, |
|
"grad_norm": 2.1977097150554736, |
|
"learning_rate": 3.4924893499439096e-06, |
|
"loss": 0.4092, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.6404736275565124, |
|
"grad_norm": 2.308071327837962, |
|
"learning_rate": 3.447768852516554e-06, |
|
"loss": 0.4089, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.6431646932185145, |
|
"grad_norm": 2.488292604221286, |
|
"learning_rate": 3.4031853550388176e-06, |
|
"loss": 0.4051, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.6458557588805167, |
|
"grad_norm": 2.152365906777766, |
|
"learning_rate": 3.3587427924510086e-06, |
|
"loss": 0.42, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.6485468245425189, |
|
"grad_norm": 2.3638622330762313, |
|
"learning_rate": 3.314445087254518e-06, |
|
"loss": 0.4116, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.6512378902045209, |
|
"grad_norm": 2.3651152931950556, |
|
"learning_rate": 3.2702961491656197e-06, |
|
"loss": 0.4073, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.6539289558665231, |
|
"grad_norm": 2.3400334398545604, |
|
"learning_rate": 3.226299874770402e-06, |
|
"loss": 0.4081, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.6566200215285253, |
|
"grad_norm": 2.4425548794489376, |
|
"learning_rate": 3.1824601471808504e-06, |
|
"loss": 0.4026, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.6593110871905274, |
|
"grad_norm": 2.1134533455735975, |
|
"learning_rate": 3.138780835692132e-06, |
|
"loss": 0.396, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.6620021528525296, |
|
"grad_norm": 2.2361134230955497, |
|
"learning_rate": 3.0952657954410792e-06, |
|
"loss": 0.4023, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.6646932185145318, |
|
"grad_norm": 2.300054861176251, |
|
"learning_rate": 3.051918867065944e-06, |
|
"loss": 0.4028, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.667384284176534, |
|
"grad_norm": 2.2448074893564236, |
|
"learning_rate": 3.0087438763674226e-06, |
|
"loss": 0.4026, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.670075349838536, |
|
"grad_norm": 2.3641837901917397, |
|
"learning_rate": 2.9657446339709906e-06, |
|
"loss": 0.4054, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.6727664155005382, |
|
"grad_norm": 2.192051672724823, |
|
"learning_rate": 2.9229249349905686e-06, |
|
"loss": 0.4028, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.6754574811625403, |
|
"grad_norm": 2.119933396606216, |
|
"learning_rate": 2.8802885586935794e-06, |
|
"loss": 0.4094, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.6781485468245425, |
|
"grad_norm": 2.2802712836694647, |
|
"learning_rate": 2.837839268167373e-06, |
|
"loss": 0.3971, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.6808396124865447, |
|
"grad_norm": 2.2055990856012975, |
|
"learning_rate": 2.7955808099871196e-06, |
|
"loss": 0.3963, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.6835306781485468, |
|
"grad_norm": 2.3222287842654734, |
|
"learning_rate": 2.7535169138851124e-06, |
|
"loss": 0.3877, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.686221743810549, |
|
"grad_norm": 2.4135369128813546, |
|
"learning_rate": 2.711651292421593e-06, |
|
"loss": 0.4048, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.6889128094725512, |
|
"grad_norm": 2.2731846505644757, |
|
"learning_rate": 2.6699876406570823e-06, |
|
"loss": 0.393, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.6916038751345532, |
|
"grad_norm": 2.1145655786156223, |
|
"learning_rate": 2.62852963582625e-06, |
|
"loss": 0.3898, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.6942949407965554, |
|
"grad_norm": 2.156814305425583, |
|
"learning_rate": 2.5872809370133704e-06, |
|
"loss": 0.3975, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.6969860064585576, |
|
"grad_norm": 2.17399225893929, |
|
"learning_rate": 2.5462451848293535e-06, |
|
"loss": 0.3954, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.6996770721205597, |
|
"grad_norm": 2.2783217042436403, |
|
"learning_rate": 2.5054260010904423e-06, |
|
"loss": 0.3907, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.7023681377825619, |
|
"grad_norm": 2.1792138822114864, |
|
"learning_rate": 2.464826988498544e-06, |
|
"loss": 0.3836, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.7050592034445641, |
|
"grad_norm": 2.473989047844672, |
|
"learning_rate": 2.424451730323261e-06, |
|
"loss": 0.3927, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.7077502691065662, |
|
"grad_norm": 2.1752762929617795, |
|
"learning_rate": 2.3843037900856174e-06, |
|
"loss": 0.3857, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.7104413347685683, |
|
"grad_norm": 2.0528470041787314, |
|
"learning_rate": 2.3443867112435585e-06, |
|
"loss": 0.3898, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.7131324004305705, |
|
"grad_norm": 2.241222494543154, |
|
"learning_rate": 2.304704016879195e-06, |
|
"loss": 0.3896, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.7158234660925726, |
|
"grad_norm": 2.285978004352459, |
|
"learning_rate": 2.265259209387867e-06, |
|
"loss": 0.3776, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.7185145317545748, |
|
"grad_norm": 2.2097751903596317, |
|
"learning_rate": 2.226055770169002e-06, |
|
"loss": 0.3864, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.721205597416577, |
|
"grad_norm": 2.203816223116715, |
|
"learning_rate": 2.1870971593188704e-06, |
|
"loss": 0.3894, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.7238966630785791, |
|
"grad_norm": 2.2001510451342723, |
|
"learning_rate": 2.148386815325179e-06, |
|
"loss": 0.3733, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.7265877287405813, |
|
"grad_norm": 2.4259755693530307, |
|
"learning_rate": 2.109928154763606e-06, |
|
"loss": 0.3864, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.7292787944025835, |
|
"grad_norm": 2.2522098918819995, |
|
"learning_rate": 2.0717245719962347e-06, |
|
"loss": 0.38, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.7319698600645855, |
|
"grad_norm": 2.244801331421332, |
|
"learning_rate": 2.0337794388719845e-06, |
|
"loss": 0.3757, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.7346609257265877, |
|
"grad_norm": 2.14985806775455, |
|
"learning_rate": 1.9960961044290015e-06, |
|
"loss": 0.3709, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.7373519913885899, |
|
"grad_norm": 2.133997450184727, |
|
"learning_rate": 1.9586778945990785e-06, |
|
"loss": 0.3871, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.740043057050592, |
|
"grad_norm": 2.187211242307294, |
|
"learning_rate": 1.921528111914102e-06, |
|
"loss": 0.3921, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.7427341227125942, |
|
"grad_norm": 2.065818127506853, |
|
"learning_rate": 1.8846500352145753e-06, |
|
"loss": 0.3763, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.7454251883745964, |
|
"grad_norm": 2.279674062937155, |
|
"learning_rate": 1.848046919360225e-06, |
|
"loss": 0.3795, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.7481162540365985, |
|
"grad_norm": 2.110003155472943, |
|
"learning_rate": 1.811721994942731e-06, |
|
"loss": 0.3702, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.7508073196986006, |
|
"grad_norm": 2.0546799160456426, |
|
"learning_rate": 1.775678468000589e-06, |
|
"loss": 0.3632, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.7534983853606028, |
|
"grad_norm": 2.1355174912954213, |
|
"learning_rate": 1.7399195197361507e-06, |
|
"loss": 0.375, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.7561894510226049, |
|
"grad_norm": 2.18931120813696, |
|
"learning_rate": 1.7044483062348465e-06, |
|
"loss": 0.3752, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.7588805166846071, |
|
"grad_norm": 2.245342573028182, |
|
"learning_rate": 1.6692679581866334e-06, |
|
"loss": 0.3812, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.7615715823466093, |
|
"grad_norm": 2.0518484711531513, |
|
"learning_rate": 1.6343815806096764e-06, |
|
"loss": 0.3713, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.7642626480086114, |
|
"grad_norm": 2.175221543045399, |
|
"learning_rate": 1.5997922525763015e-06, |
|
"loss": 0.3694, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.7669537136706136, |
|
"grad_norm": 2.215521340759257, |
|
"learning_rate": 1.5655030269412375e-06, |
|
"loss": 0.3603, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.7696447793326158, |
|
"grad_norm": 2.010052048626537, |
|
"learning_rate": 1.5315169300721694e-06, |
|
"loss": 0.3626, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.7723358449946178, |
|
"grad_norm": 2.18653300195142, |
|
"learning_rate": 1.4978369615826316e-06, |
|
"loss": 0.3731, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.77502691065662, |
|
"grad_norm": 1.9946035618643139, |
|
"learning_rate": 1.4644660940672628e-06, |
|
"loss": 0.3774, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.7777179763186222, |
|
"grad_norm": 2.2272039442093026, |
|
"learning_rate": 1.431407272839443e-06, |
|
"loss": 0.3655, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.7804090419806243, |
|
"grad_norm": 2.1332605024963964, |
|
"learning_rate": 1.3986634156713418e-06, |
|
"loss": 0.3755, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.7831001076426265, |
|
"grad_norm": 2.065655707292035, |
|
"learning_rate": 1.3662374125363954e-06, |
|
"loss": 0.367, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.7857911733046287, |
|
"grad_norm": 2.172706147815118, |
|
"learning_rate": 1.334132125354236e-06, |
|
"loss": 0.3685, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.7884822389666308, |
|
"grad_norm": 2.3789895501877196, |
|
"learning_rate": 1.302350387738101e-06, |
|
"loss": 0.3703, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.7911733046286329, |
|
"grad_norm": 2.163353498298291, |
|
"learning_rate": 1.270895004744737e-06, |
|
"loss": 0.369, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.7938643702906351, |
|
"grad_norm": 2.010979175475219, |
|
"learning_rate": 1.2397687526268248e-06, |
|
"loss": 0.3521, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.7965554359526372, |
|
"grad_norm": 2.2070950181380566, |
|
"learning_rate": 1.2089743785879493e-06, |
|
"loss": 0.3621, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.7992465016146394, |
|
"grad_norm": 2.2001045079137467, |
|
"learning_rate": 1.1785146005401292e-06, |
|
"loss": 0.3733, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.8019375672766416, |
|
"grad_norm": 2.001155529188707, |
|
"learning_rate": 1.1483921068639353e-06, |
|
"loss": 0.3632, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.8046286329386437, |
|
"grad_norm": 2.154399208032811, |
|
"learning_rate": 1.118609556171213e-06, |
|
"loss": 0.3708, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.8073196986006459, |
|
"grad_norm": 1.9221421559210652, |
|
"learning_rate": 1.0891695770704341e-06, |
|
"loss": 0.355, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.810010764262648, |
|
"grad_norm": 1.9509787136447798, |
|
"learning_rate": 1.0600747679346956e-06, |
|
"loss": 0.3467, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.8127018299246501, |
|
"grad_norm": 2.103491892859654, |
|
"learning_rate": 1.0313276966723867e-06, |
|
"loss": 0.37, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.8153928955866523, |
|
"grad_norm": 1.9409728720505934, |
|
"learning_rate": 1.002930900500546e-06, |
|
"loss": 0.3522, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.8180839612486545, |
|
"grad_norm": 1.949276176991141, |
|
"learning_rate": 9.74886885720925e-07, |
|
"loss": 0.3619, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.8207750269106566, |
|
"grad_norm": 1.9562682681398276, |
|
"learning_rate": 9.471981274987846e-07, |
|
"loss": 0.3464, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.8234660925726588, |
|
"grad_norm": 2.1766498772006155, |
|
"learning_rate": 9.198670696444339e-07, |
|
"loss": 0.3623, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.826157158234661, |
|
"grad_norm": 2.423750273252814, |
|
"learning_rate": 8.928961243975437e-07, |
|
"loss": 0.3621, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.8288482238966631, |
|
"grad_norm": 2.049645597913739, |
|
"learning_rate": 8.662876722142327e-07, |
|
"loss": 0.3516, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.8315392895586652, |
|
"grad_norm": 2.092144600157003, |
|
"learning_rate": 8.400440615569849e-07, |
|
"loss": 0.3561, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.8342303552206674, |
|
"grad_norm": 2.0626372120014063, |
|
"learning_rate": 8.141676086873574e-07, |
|
"loss": 0.3623, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.8369214208826695, |
|
"grad_norm": 2.35473646860873, |
|
"learning_rate": 7.886605974615574e-07, |
|
"loss": 0.3613, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.8396124865446717, |
|
"grad_norm": 2.156694991140723, |
|
"learning_rate": 7.635252791288611e-07, |
|
"loss": 0.3466, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.8423035522066739, |
|
"grad_norm": 1.9579670944342409, |
|
"learning_rate": 7.38763872132931e-07, |
|
"loss": 0.348, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.844994617868676, |
|
"grad_norm": 2.0667736004944137, |
|
"learning_rate": 7.143785619160026e-07, |
|
"loss": 0.3481, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.8476856835306782, |
|
"grad_norm": 2.3663305744973044, |
|
"learning_rate": 6.903715007260043e-07, |
|
"loss": 0.3548, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.8503767491926802, |
|
"grad_norm": 1.9836581746638566, |
|
"learning_rate": 6.667448074265954e-07, |
|
"loss": 0.3482, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.8530678148546824, |
|
"grad_norm": 1.9785157074619235, |
|
"learning_rate": 6.435005673101646e-07, |
|
"loss": 0.3511, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.8557588805166846, |
|
"grad_norm": 2.020013708865099, |
|
"learning_rate": 6.206408319137703e-07, |
|
"loss": 0.3427, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.8584499461786868, |
|
"grad_norm": 2.034476487333608, |
|
"learning_rate": 5.981676188380802e-07, |
|
"loss": 0.3562, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.8611410118406889, |
|
"grad_norm": 2.0354731199908214, |
|
"learning_rate": 5.760829115692907e-07, |
|
"loss": 0.3484, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.8638320775026911, |
|
"grad_norm": 2.178345640449028, |
|
"learning_rate": 5.543886593040737e-07, |
|
"loss": 0.3473, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.8665231431646933, |
|
"grad_norm": 2.105076271550617, |
|
"learning_rate": 5.330867767775333e-07, |
|
"loss": 0.3465, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.8692142088266954, |
|
"grad_norm": 2.1543768721065844, |
|
"learning_rate": 5.121791440942131e-07, |
|
"loss": 0.3479, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.8719052744886975, |
|
"grad_norm": 2.158400755663796, |
|
"learning_rate": 4.916676065621562e-07, |
|
"loss": 0.3586, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.8745963401506996, |
|
"grad_norm": 2.0683369029384497, |
|
"learning_rate": 4.715539745300429e-07, |
|
"loss": 0.361, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.8772874058127018, |
|
"grad_norm": 2.0446823246419505, |
|
"learning_rate": 4.5184002322740784e-07, |
|
"loss": 0.3547, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.879978471474704, |
|
"grad_norm": 1.9815928231734286, |
|
"learning_rate": 4.3252749260795533e-07, |
|
"loss": 0.3563, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.8826695371367062, |
|
"grad_norm": 2.0032524155103957, |
|
"learning_rate": 4.1361808719599163e-07, |
|
"loss": 0.3544, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.8853606027987083, |
|
"grad_norm": 1.8700071761196195, |
|
"learning_rate": 3.951134759359854e-07, |
|
"loss": 0.3423, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.8880516684607105, |
|
"grad_norm": 2.205223972755586, |
|
"learning_rate": 3.7701529204526856e-07, |
|
"loss": 0.3374, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8907427341227125, |
|
"grad_norm": 2.1069260105110357, |
|
"learning_rate": 3.5932513286988436e-07, |
|
"loss": 0.358, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.8934337997847147, |
|
"grad_norm": 2.0793947298202857, |
|
"learning_rate": 3.420445597436056e-07, |
|
"loss": 0.3466, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.8961248654467169, |
|
"grad_norm": 2.354227369972549, |
|
"learning_rate": 3.251750978501339e-07, |
|
"loss": 0.3386, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.898815931108719, |
|
"grad_norm": 1.9498999228151335, |
|
"learning_rate": 3.087182360884872e-07, |
|
"loss": 0.3463, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.9015069967707212, |
|
"grad_norm": 2.051978527036425, |
|
"learning_rate": 2.926754269415877e-07, |
|
"loss": 0.3465, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.9041980624327234, |
|
"grad_norm": 1.9319927347974046, |
|
"learning_rate": 2.77048086348064e-07, |
|
"loss": 0.3435, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.9068891280947255, |
|
"grad_norm": 2.000654717396208, |
|
"learning_rate": 2.6183759357728543e-07, |
|
"loss": 0.3478, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.9095801937567277, |
|
"grad_norm": 2.0939867990033574, |
|
"learning_rate": 2.470452911076227e-07, |
|
"loss": 0.3323, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.9122712594187298, |
|
"grad_norm": 2.0560991566205584, |
|
"learning_rate": 2.326724845079653e-07, |
|
"loss": 0.3576, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.9149623250807319, |
|
"grad_norm": 2.0115882952141915, |
|
"learning_rate": 2.1872044232248646e-07, |
|
"loss": 0.3534, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.9176533907427341, |
|
"grad_norm": 1.9266766405004774, |
|
"learning_rate": 2.0519039595868706e-07, |
|
"loss": 0.3532, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.9203444564047363, |
|
"grad_norm": 1.9314452273068792, |
|
"learning_rate": 1.9208353957870684e-07, |
|
"loss": 0.3572, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.9230355220667384, |
|
"grad_norm": 1.9735976195136693, |
|
"learning_rate": 1.7940102999393194e-07, |
|
"loss": 0.3416, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.9257265877287406, |
|
"grad_norm": 1.8501312984088143, |
|
"learning_rate": 1.6714398656289154e-07, |
|
"loss": 0.3319, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.9284176533907428, |
|
"grad_norm": 2.0192846857060633, |
|
"learning_rate": 1.5531349109246364e-07, |
|
"loss": 0.3526, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.9311087190527448, |
|
"grad_norm": 2.192842156423193, |
|
"learning_rate": 1.439105877423963e-07, |
|
"loss": 0.3394, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.933799784714747, |
|
"grad_norm": 2.284759597306071, |
|
"learning_rate": 1.3293628293314876e-07, |
|
"loss": 0.3556, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.9364908503767492, |
|
"grad_norm": 2.0728951089806604, |
|
"learning_rate": 1.223915452570651e-07, |
|
"loss": 0.3488, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.9391819160387513, |
|
"grad_norm": 2.127387782252794, |
|
"learning_rate": 1.1227730539288717e-07, |
|
"loss": 0.3458, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.9418729817007535, |
|
"grad_norm": 2.1743344874964152, |
|
"learning_rate": 1.0259445602361084e-07, |
|
"loss": 0.3448, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.9445640473627557, |
|
"grad_norm": 1.9486192237772186, |
|
"learning_rate": 9.334385175769955e-08, |
|
"loss": 0.3497, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.9472551130247578, |
|
"grad_norm": 2.031170994409377, |
|
"learning_rate": 8.452630905365633e-08, |
|
"loss": 0.3422, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.94994617868676, |
|
"grad_norm": 2.0123125989553223, |
|
"learning_rate": 7.614260614796143e-08, |
|
"loss": 0.3384, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.9526372443487621, |
|
"grad_norm": 1.921930000860114, |
|
"learning_rate": 6.819348298638839e-08, |
|
"loss": 0.3464, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.9553283100107642, |
|
"grad_norm": 2.040424462430464, |
|
"learning_rate": 6.067964115869297e-08, |
|
"loss": 0.3446, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.9580193756727664, |
|
"grad_norm": 2.008560647062215, |
|
"learning_rate": 5.36017438366937e-08, |
|
"loss": 0.3435, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.9607104413347686, |
|
"grad_norm": 1.998392701697555, |
|
"learning_rate": 4.696041571573773e-08, |
|
"loss": 0.3437, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.9634015069967707, |
|
"grad_norm": 1.9335547030262188, |
|
"learning_rate": 4.0756242959567596e-08, |
|
"loss": 0.3445, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.9660925726587729, |
|
"grad_norm": 2.1814630474711496, |
|
"learning_rate": 3.498977314858487e-08, |
|
"loss": 0.3464, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.9687836383207751, |
|
"grad_norm": 1.9036442188692122, |
|
"learning_rate": 2.96615152315205e-08, |
|
"loss": 0.3391, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.9714747039827771, |
|
"grad_norm": 2.0417825783497046, |
|
"learning_rate": 2.4771939480516817e-08, |
|
"loss": 0.3443, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.9741657696447793, |
|
"grad_norm": 2.097798597059078, |
|
"learning_rate": 2.0321477449619098e-08, |
|
"loss": 0.3464, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.9768568353067815, |
|
"grad_norm": 2.082332696428834, |
|
"learning_rate": 1.6310521936688806e-08, |
|
"loss": 0.3516, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.9795479009687836, |
|
"grad_norm": 2.043985417106578, |
|
"learning_rate": 1.2739426948732426e-08, |
|
"loss": 0.337, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.9822389666307858, |
|
"grad_norm": 2.0493614783494256, |
|
"learning_rate": 9.608507670659239e-09, |
|
"loss": 0.3402, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.984930032292788, |
|
"grad_norm": 2.0501039577982874, |
|
"learning_rate": 6.918040437463025e-09, |
|
"loss": 0.3429, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.9876210979547901, |
|
"grad_norm": 2.0871992562422963, |
|
"learning_rate": 4.668262709830451e-09, |
|
"loss": 0.331, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.9903121636167922, |
|
"grad_norm": 2.182084159737187, |
|
"learning_rate": 2.8593730531861764e-09, |
|
"loss": 0.3504, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.9930032292787944, |
|
"grad_norm": 2.0603029829015314, |
|
"learning_rate": 1.4915311201635362e-09, |
|
"loss": 0.3417, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.9956942949407965, |
|
"grad_norm": 2.1347005430099935, |
|
"learning_rate": 5.648576365169245e-10, |
|
"loss": 0.3381, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.9983853606027987, |
|
"grad_norm": 2.0300038148401574, |
|
"learning_rate": 7.943439046531609e-11, |
|
"loss": 0.3392, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_runtime": 3.3821, |
|
"eval_samples_per_second": 2.957, |
|
"eval_steps_per_second": 0.887, |
|
"step": 1858 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 1858, |
|
"total_flos": 194513700126720.0, |
|
"train_loss": 0.5105859521772428, |
|
"train_runtime": 16588.6757, |
|
"train_samples_per_second": 1.791, |
|
"train_steps_per_second": 0.112 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1858, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 194513700126720.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|